import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Skeleton for a Hadoop MapReduce job: wires up a mapper, combiner, reducer,
 * partitioner, and sort/grouping comparators, then submits the job via
 * {@link ToolRunner}. All map/reduce/compare bodies are empty stubs meant to
 * be filled in per job.
 *
 * <p>Map output and final output types are both (LongWritable, Text); the
 * partitioner and comparators are therefore typed on the map output KEY,
 * LongWritable.
 */
public class Template extends Configured implements Tool {

    /** Mapper stub: receives (byte offset, line) from text input; emits nothing yet. */
    public static class M extends
            Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text values, Context context)
                throws IOException, InterruptedException {
            // TODO: emit (LongWritable, Text) pairs via context.write(...)
        }
    }

    /** Reducer stub (also used as the combiner): emits nothing yet. */
    public static class R extends
            Reducer<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void reduce(LongWritable key, Iterable<Text> values,
                Context context) throws IOException, InterruptedException {
            // TODO: aggregate values and context.write(...)
        }
    }

    /**
     * Hash partitioner over the MAP OUTPUT key.
     *
     * <p>Fix: the original was typed {@code Partitioner<Text, LongWritable>},
     * the reverse of the mapper's (LongWritable, Text) output — Hadoop would
     * fail with a ClassCastException at partition time.
     */
    public static class P extends Partitioner<LongWritable, Text> {
        @Override
        public int getPartition(LongWritable key, Text value, int parts) {
            // Mask off the sign bit so the modulus is always non-negative.
            return (key.hashCode() & Integer.MAX_VALUE) % parts;
        }
    }

    /**
     * Grouping comparator stub over the map output key (LongWritable).
     * Returning 0 everywhere means ALL keys are grouped into a single
     * reduce() call — replace with real comparison logic.
     * (Fix: was typed on Text, but the map output key is LongWritable.)
     */
    public static class G implements RawComparator<LongWritable> {
        @Override
        public int compare(LongWritable o1, LongWritable o2) {
            return 0; // TODO: real object-level comparison
        }

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return 0; // TODO: real byte-level comparison of serialized keys
        }
    }

    /**
     * Sort comparator stub over the map output key (LongWritable).
     * Returning 0 everywhere leaves shuffle order effectively undefined —
     * replace with real comparison logic.
     * (Fix: was typed on Text, but the map output key is LongWritable.)
     */
    public static class C implements RawComparator<LongWritable> {
        @Override
        public int compare(LongWritable o1, LongWritable o2) {
            return 0; // TODO: real object-level comparison
        }

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return 0; // TODO: real byte-level comparison of serialized keys
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args args[0] = comma-separated input path(s); args[1] = output path
     * @return 0 on success, 1 on job failure, 2 on bad usage
     * @throws Exception if job submission or monitoring fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: Template <input path(s)> <output path>");
            return 2;
        }
        // Job.getInstance replaces the deprecated new Job(Configuration, String).
        Job job = Job.getInstance(getConf(), "Template this is!");
        job.setJarByClass(Template.class);
        job.setMapperClass(M.class);
        job.setCombinerClass(R.class);
        job.setReducerClass(R.class);
        job.setPartitionerClass(P.class);
        job.setGroupingComparatorClass(G.class);
        job.setSortComparatorClass(C.class);
        FileInputFormat.addInputPaths(job, args[0]);
        // Alternative input format (requires LZO libraries):
        // job.setInputFormatClass(LzoTextInputFormat.class);
        // LzoTextInputFormat.addInputPaths(job, args[0]);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Alternative output formats / compression settings:
        // job.setOutputFormatClass(TextOutputFormat.class);
        // TextOutputFormat.setOutputPath(job, new Path(args[1]));
        // TextOutputFormat.setCompressOutput(job, true);
        // TextOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        // job.setOutputFormatClass(SequenceFileOutputFormat.class);
        // SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));
        // SequenceFileOutputFormat.setCompressOutput(job, true);
        // SequenceFileOutputFormat.setOutputCompressorClass(job,
        // GzipCodec.class);
        // SequenceFileOutputFormat.setOutputCompressionType(job,
        // CompressionType.BLOCK);
        boolean successful = job.waitForCompletion(true);
        System.out.println(job.getJobID()
                + (successful ? " :successful" : " :failed"));
        return successful ? 0 : 1;
    }

    /**
     * Entry point: delegates to {@link ToolRunner} so that standard Hadoop
     * generic options (-D, -files, ...) are parsed before run() is called.
     *
     * @param args command-line arguments, forwarded to {@link #run(String[])}
     * @throws Exception propagated from job execution
     */
    public static void main(String[] args) throws Exception {
        System.out.println("Hello World!");
        System.exit(ToolRunner.run(new Configuration(), new Template(), args));
    }
}
关于本地测试 (About local testing):
There are a few approaches to debugging your Hadoop MapReduce:
You can use MRUnit (http://www.cloudera.com/hadoop-mrunit) to write tests for your MapReduce. You'll be able to do this within Eclipse, so it's easy to debug the specific logic.
You can set your jobtracker to "local" (mapred.job.tracker is the config variable), which then runs the maps and the reducer all in the same VM as the job submission. You can trigger this from Eclipse, and set breakpoints, etc. See http://wiki.apache.org/hadoop/HowToDebugMapReducePrograms .
The trickiest approach is to attach a debugger to job that Hadoop spawns. This isn't as simple as attaching a debugger to TaskTracker, since the TaskTracker itself forks off a new subprocess (called Child) to run your tasks. You can use mapred.child.java.opts to add -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8020 to the child tasks. Then, you'll be able to attach Eclipse (or any other Java debugger) to port 8020.
The reason you're not seeing the Hadoop daemons with jps is that they're probably running as the user hadoop. If you use sudo jps, you'll probably see those processes.
http://wiki.apache.org/hadoop/HowToDebugMapReducePrograms
以下 JVM 参数，酌量添加 (The following JVM parameters — add as appropriate):
-Dfile.encoding=UTF-8 -Duser.language=zh -Xms1024m -Xmx1024m -XX:PermSize=64M -XX:MaxPermSize=128m -XX:MaxNewSize=256m -Djava.awt.headless=true