With help from colleagues at work, I wrote a word count program. Compared with the example on the Hadoop website, this version takes several command-line parameters that the user can set, and it reports processing progress in real time (reported as the number of tokens processed so far).
There are only two classes in total. The first is the configuration interface, Settings:
package wordCount;

public interface Settings {
    public static final String HELP_OPTION = "help";
    public static final String PATH_INDICATOR = "path";
    public static final String STRING_INDICATOR = "string";
    public static final String INTEGER_INDICATOR = "int";

    public static final String INPUT_OPTION = "input";
    public static final String OUTPUT_OPTION = "output";
    public static final String FILTER_OPTION = "filter";
    public static final String TYPE_OPTION = "type";
    public static final String MAPPER_OPTION = "mapper";
    public static final String REDUCER_OPTION = "reducer";

    public static final int DEFAULT_NUMBER_OF_MAPPERS = 1;
    public static final int DEFAULT_NUMBER_OF_REDUCERS = 1;
    public static final String DEFAULT_TYPE = "news";
}
The other is the main program:
package wordCount;
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
public class Count extends Configured implements Tool {

    static final Logger sLogger = Logger.getLogger(Count.class);

    // Configuration key under which the -type value is passed to the tasks
    private static final String OPTION = "user_news";

    // TOKEN counts every token the mappers process; UNIQUE_TOKEN counts the
    // distinct words the reducers emit. They double as a progress indicator.
    private static enum MyCounter {
        TOKEN, UNIQUE_TOKEN
    }
    public static class CountMapper extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            // Split each input line on whitespace and emit (word, 1) pairs
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
                // Incrementing the counter makes progress visible while the job runs
                reporter.incrCounter(MyCounter.TOKEN, 1);
            }
        }
    }
    public static class CountReducer extends MapReduceBase implements
            Reducer<Text, IntWritable, Text, IntWritable> {

        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            // Sum the partial counts for this word and emit the total
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
            reporter.incrCounter(MyCounter.UNIQUE_TOKEN, 1);
        }
    }
    public void count(String inputPath, String outputPath, String filterFile,
            String user_news, int numberOfMappers, int numberOfReducers)
            throws Exception {
        Path confPath = null;
        if (filterFile != null && filterFile.length() > 0) {
            confPath = new Path(filterFile);
        }

        sLogger.info("Tool: " + Count.class.getSimpleName());
        sLogger.info(" - input path: " + inputPath);
        sLogger.info(" - output path: " + outputPath);
        sLogger.info(" - number of mappers: " + numberOfMappers);
        sLogger.info(" - number of reducers: " + numberOfReducers);
        sLogger.info(" - filter path: " + filterFile);
        sLogger.info(" - type: " + user_news);

        JobConf conf = new JobConf(Count.class);
        conf.setJobName("FZX " + Count.class.getSimpleName());
        conf.set(OPTION, user_news);

        // Preconditions.checkArgument(fs.exists(confPath),
        // "Missing term index files...");

        // Ship the optional filter file to every task via the distributed cache
        if (confPath != null) {
            DistributedCache.addCacheFile(confPath.toUri(), conf);
        }

        conf.setNumMapTasks(numberOfMappers);
        conf.setNumReduceTasks(numberOfReducers);

        conf.setMapperClass(CountMapper.class);
        conf.setReducerClass(CountReducer.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // conf.setInputFormat(DeprecatedLzoTextInputFormat.class);
        // conf.setInputFormat(LzoTextInputFormat.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));
        // FileOutputFormat.setCompressOutput(conf, true);

        long startTime = System.currentTimeMillis();
        // runJob() blocks until the job completes
        RunningJob job = JobClient.runJob(conf);
        sLogger.info("Job Finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0
                + " seconds");
    }
    public int run(String[] args) throws Exception {
        // Define the command-line options using the names from Settings
        Options options = new Options();
        options.addOption(Settings.HELP_OPTION, false, "print the help message");
        options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR)
                .hasArg().withDescription("input file(s) or directory")
                .isRequired().create(Settings.INPUT_OPTION));
        options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR)
                .hasArg().withDescription("filter file(s)")
                .create(Settings.FILTER_OPTION));
        options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR)
                .hasArg().withDescription("output directory").isRequired()
                .create(Settings.OUTPUT_OPTION));
        options.addOption(OptionBuilder
                .withArgName(Settings.INTEGER_INDICATOR)
                .hasArg()
                .withDescription(
                        "number of mappers (default - "
                                + Settings.DEFAULT_NUMBER_OF_MAPPERS + ")")
                .create(Settings.MAPPER_OPTION));
        options.addOption(OptionBuilder
                .withArgName(Settings.INTEGER_INDICATOR)
                .hasArg()
                .withDescription(
                        "number of reducers (default - "
                                + Settings.DEFAULT_NUMBER_OF_REDUCERS + ")")
                .create(Settings.REDUCER_OPTION));
        options.addOption(OptionBuilder
                .withArgName(Settings.STRING_INDICATOR)
                .hasArg()
                .withDescription(
                        "type (default - " + Settings.DEFAULT_TYPE + ")")
                .create(Settings.TYPE_OPTION));

        String inputPath = null;
        String outputPath = null;
        String filterPath = null;
        String type = Settings.DEFAULT_TYPE;
        int numberOfMappers = Settings.DEFAULT_NUMBER_OF_MAPPERS;
        int numberOfReducers = Settings.DEFAULT_NUMBER_OF_REDUCERS;

        // Parse the arguments; fall back to the defaults above where possible
        CommandLineParser parser = new GnuParser();
        HelpFormatter formatter = new HelpFormatter();
        try {
            CommandLine line = parser.parse(options, args);
            if (line.hasOption(Settings.INPUT_OPTION)) {
                inputPath = line.getOptionValue(Settings.INPUT_OPTION);
            } else {
                throw new ParseException("Parsing failed due to "
                        + Settings.INPUT_OPTION + " not initialized...");
            }
            if (line.hasOption(Settings.OUTPUT_OPTION)) {
                outputPath = line.getOptionValue(Settings.OUTPUT_OPTION);
            } else {
                throw new ParseException("Parsing failed due to "
                        + Settings.OUTPUT_OPTION + " not initialized...");
            }
            if (line.hasOption(Settings.FILTER_OPTION)) {
                filterPath = line.getOptionValue(Settings.FILTER_OPTION);
            }
            if (line.hasOption(Settings.MAPPER_OPTION)) {
                numberOfMappers = Integer.parseInt(line
                        .getOptionValue(Settings.MAPPER_OPTION));
            }
            if (line.hasOption(Settings.REDUCER_OPTION)) {
                numberOfReducers = Integer.parseInt(line
                        .getOptionValue(Settings.REDUCER_OPTION));
            }
            if (line.hasOption(Settings.TYPE_OPTION)) {
                type = line.getOptionValue(Settings.TYPE_OPTION);
            }
        } catch (ParseException pe) {
            System.err.println(pe.getMessage());
            formatter.printHelp(Count.class.getName(), options);
            // Exit with a non-zero code so callers can detect the failure
            System.exit(1);
        } catch (NumberFormatException nfe) {
            System.err.println(nfe.getMessage());
            System.exit(1);
        }

        // Delete the output directory if it exists already
        FileSystem fs = FileSystem.get(new JobConf(Count.class));
        fs.delete(new Path(outputPath), true);

        try {
            count(inputPath, outputPath, filterPath, type, numberOfMappers,
                    numberOfReducers);
        } finally {
            // fs.delete(new Path(outputPath), true);
        }

        return 0;
    }
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new Count(), args);
        System.exit(res);
    }
}
Here CountMapper performs the map step and CountReducer performs the reduce step.
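One thing worth noting: count() ships the optional filter file to every task through DistributedCache, but CountMapper as shown never actually reads it. As an illustration of how the mapper could consume the cached file, here is a sketch of a variant that loads it in configure() and skips any token listed in it. The FilteringCountMapper name and the one-token-per-line file format are my own assumptions, not part of the original program; the class also needs java.io.BufferedReader, java.io.FileReader, java.util.HashSet, and java.util.Set added to the imports.

    public static class FilteringCountMapper extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private Set<String> filter = new HashSet<String>();

        @Override
        public void configure(JobConf job) {
            try {
                // Files added via DistributedCache.addCacheFile() show up here
                Path[] cached = DistributedCache.getLocalCacheFiles(job);
                if (cached != null && cached.length > 0) {
                    // Assumed format: one token to filter out per line
                    BufferedReader reader = new BufferedReader(
                            new FileReader(cached[0].toString()));
                    String token;
                    while ((token = reader.readLine()) != null) {
                        filter.add(token.trim());
                    }
                    reader.close();
                }
            } catch (IOException e) {
                throw new RuntimeException("Failed to load filter file", e);
            }
        }

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                String token = tokenizer.nextToken();
                if (filter.contains(token)) {
                    continue; // drop tokens listed in the filter file
                }
                word.set(token);
                output.collect(word, one);
                reporter.incrCounter(MyCounter.TOKEN, 1);
            }
        }
    }

To use it, you would swap conf.setMapperClass(CountMapper.class) for conf.setMapperClass(FilteringCountMapper.class) in count().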
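Also, since JobClient.runJob() blocks until the job finishes, the TOKEN counter is only visible through Hadoop's own client-side progress output. If you wanted to poll it yourself while the job runs, a minimal sketch along the following lines should work, assuming MyCounter is made public so it can be referenced from outside Count (the CountMonitor class name and the five-second polling interval are my own choices, not part of the original program):

    package wordCount;

    import org.apache.hadoop.mapred.Counters;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.RunningJob;

    public class CountMonitor {
        // Submit the job without blocking, then poll its counters until done
        public static void runAndMonitor(JobConf conf) throws Exception {
            JobClient client = new JobClient(conf);
            RunningJob job = client.submitJob(conf); // returns immediately
            while (!job.isComplete()) {
                Counters counters = job.getCounters();
                if (counters != null) {
                    // Requires MyCounter to be visible outside Count
                    long tokens = counters.getCounter(Count.MyCounter.TOKEN);
                    System.out.printf("map %.0f%%, reduce %.0f%%, tokens so far: %d%n",
                            job.mapProgress() * 100, job.reduceProgress() * 100,
                            tokens);
                }
                Thread.sleep(5000);
            }
        }
    }

This only matters if you want counter updates between Hadoop's own progress reports; otherwise the blocking runJob() call in count() is simpler.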