With help from colleagues at work, I wrote a word count program. Compared with the example on the Hadoop website, this version takes several command-line parameters that the user can set, and it reports processing progress in real time (reported as the number of tokens processed so far).
There are only two classes in total. The first is the configuration interface, Settings:
package wordCount;

public interface Settings {
    public static final String HELP_OPTION = "help";
    public static final String PATH_INDICATOR = "path";
    public static final String STRING_INDICATOR = "string";
    public static final String INTEGER_INDICATOR = "int";

    public static final String INPUT_OPTION = "input";
    public static final String OUTPUT_OPTION = "output";
    public static final String FILTER_OPTION = "filter";
    public static final String TYPE_OPTION = "type";
    public static final String MAPPER_OPTION = "mapper";
    public static final String REDUCER_OPTION = "reducer";

    public static final int DEFAULT_NUMBER_OF_MAPPERS = 1;
    public static final int DEFAULT_NUMBER_OF_REDUCERS = 1;
    public static final String DEFAULT_TYPE = "news";
}
The other is the main program:
package wordCount;
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
public class Count extends Configured implements Tool {

    static final Logger sLogger = Logger.getLogger(Count.class);

    // Configuration key under which the -type value is passed to the tasks
    private static final String OPTION = "user_news";

    // TOKEN counts every token the mappers process; UNIQUE_TOKEN counts the
    // distinct words the reducers emit. They double as a progress indicator.
    private static enum MyCounter {
        TOKEN, UNIQUE_TOKEN
    }
    public static class CountMapper extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            // Split each input line on whitespace and emit (word, 1) pairs
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
                // Incrementing the counter makes progress visible while the job runs
                reporter.incrCounter(MyCounter.TOKEN, 1);
            }
        }
    }
    public static class CountReducer extends MapReduceBase implements
            Reducer<Text, IntWritable, Text, IntWritable> {

        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            // Sum the partial counts for this word and emit the total
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
            reporter.incrCounter(MyCounter.UNIQUE_TOKEN, 1);
        }
    }
    public void count(String inputPath, String outputPath, String filterFile,
            String user_news, int numberOfMappers, int numberOfReducers)
            throws Exception {
        Path confPath = null;
        if (filterFile != null && filterFile.length() > 0) {
            confPath = new Path(filterFile);
        }

        sLogger.info("Tool: " + Count.class.getSimpleName());
        sLogger.info(" - input path: " + inputPath);
        sLogger.info(" - output path: " + outputPath);
        sLogger.info(" - number of mappers: " + numberOfMappers);
        sLogger.info(" - number of reducers: " + numberOfReducers);
        sLogger.info(" - filter path: " + filterFile);
        sLogger.info(" - type: " + user_news);

        JobConf conf = new JobConf(Count.class);
        conf.setJobName("FZX " + Count.class.getSimpleName());
        conf.set(OPTION, user_news);

        // Preconditions.checkArgument(fs.exists(confPath),
        // "Missing term index files...");

        // Ship the optional filter file to every task via the distributed cache
        if (confPath != null) {
            DistributedCache.addCacheFile(confPath.toUri(), conf);
        }

        conf.setNumMapTasks(numberOfMappers);
        conf.setNumReduceTasks(numberOfReducers);

        conf.setMapperClass(CountMapper.class);
        conf.setReducerClass(CountReducer.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // conf.setInputFormat(DeprecatedLzoTextInputFormat.class);
        // conf.setInputFormat(LzoTextInputFormat.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));
        // FileOutputFormat.setCompressOutput(conf, true);

        long startTime = System.currentTimeMillis();
        // runJob() blocks until the job completes
        RunningJob job = JobClient.runJob(conf);
        sLogger.info("Job Finished in "
                + (System.currentTimeMillis() - startTime) / 1000.0
                + " seconds");
    }
    public int run(String[] args) throws Exception {
        // Define the command-line options using the names from Settings
        Options options = new Options();
        options.addOption(Settings.HELP_OPTION, false, "print the help message");
        options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR)
                .hasArg().withDescription("input file(s) or directory")
                .isRequired().create(Settings.INPUT_OPTION));
        options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR)
                .hasArg().withDescription("filter file(s)")
                .create(Settings.FILTER_OPTION));
        options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR)
                .hasArg().withDescription("output directory").isRequired()
                .create(Settings.OUTPUT_OPTION));
        options.addOption(OptionBuilder
                .withArgName(Settings.INTEGER_INDICATOR)
                .hasArg()
                .withDescription(
                        "number of mappers (default - "
                                + Settings.DEFAULT_NUMBER_OF_MAPPERS + ")")
                .create(Settings.MAPPER_OPTION));
        options.addOption(OptionBuilder
                .withArgName(Settings.INTEGER_INDICATOR)
                .hasArg()
                .withDescription(
                        "number of reducers (default - "
                                + Settings.DEFAULT_NUMBER_OF_REDUCERS + ")")
                .create(Settings.REDUCER_OPTION));
        options.addOption(OptionBuilder
                .withArgName(Settings.STRING_INDICATOR)
                .hasArg()
                .withDescription(
                        "type (default - " + Settings.DEFAULT_TYPE + ")")
                .create(Settings.TYPE_OPTION));

        String inputPath = null;
        String outputPath = null;
        String filterPath = null;
        String type = Settings.DEFAULT_TYPE;
        int numberOfMappers = Settings.DEFAULT_NUMBER_OF_MAPPERS;
        int numberOfReducers = Settings.DEFAULT_NUMBER_OF_REDUCERS;

        // Parse the arguments; fall back to the defaults above where possible
        CommandLineParser parser = new GnuParser();
        HelpFormatter formatter = new HelpFormatter();
        try {
            CommandLine line = parser.parse(options, args);
            if (line.hasOption(Settings.INPUT_OPTION)) {
                inputPath = line.getOptionValue(Settings.INPUT_OPTION);
            } else {
                throw new ParseException("Parsing failed due to "
                        + Settings.INPUT_OPTION + " not initialized...");
            }
            if (line.hasOption(Settings.OUTPUT_OPTION)) {
                outputPath = line.getOptionValue(Settings.OUTPUT_OPTION);
            } else {
                throw new ParseException("Parsing failed due to "
                        + Settings.OUTPUT_OPTION + " not initialized...");
            }
            if (line.hasOption(Settings.FILTER_OPTION)) {
                filterPath = line.getOptionValue(Settings.FILTER_OPTION);
            }
            if (line.hasOption(Settings.MAPPER_OPTION)) {
                numberOfMappers = Integer.parseInt(line
                        .getOptionValue(Settings.MAPPER_OPTION));
            }
            if (line.hasOption(Settings.REDUCER_OPTION)) {
                numberOfReducers = Integer.parseInt(line
                        .getOptionValue(Settings.REDUCER_OPTION));
            }
            if (line.hasOption(Settings.TYPE_OPTION)) {
                type = line.getOptionValue(Settings.TYPE_OPTION);
            }
        } catch (ParseException pe) {
            System.err.println(pe.getMessage());
            formatter.printHelp(Count.class.getName(), options);
            // Exit with a non-zero code so callers can detect the failure
            System.exit(1);
        } catch (NumberFormatException nfe) {
            System.err.println(nfe.getMessage());
            System.exit(1);
        }

        // Delete the output directory if it exists already
        FileSystem fs = FileSystem.get(new JobConf(Count.class));
        fs.delete(new Path(outputPath), true);

        try {
            count(inputPath, outputPath, filterPath, type, numberOfMappers,
                    numberOfReducers);
        } finally {
            // fs.delete(new Path(outputPath), true);
        }

        return 0;
    }
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new Count(), args);
        System.exit(res);
    }
}
Here CountMapper performs the map step and CountReducer performs the reduce step.
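One thing worth noting: count() ships the optional filter file to every task through DistributedCache, but CountMapper as shown never actually reads it. As an illustration of how the mapper could consume the cached file, here is a sketch of a variant that loads it in configure() and skips any token listed in it. The FilteringCountMapper name and the one-token-per-line file format are my own assumptions, not part of the original program; the class also needs java.io.BufferedReader, java.io.FileReader, java.util.HashSet, and java.util.Set added to the imports.

    public static class FilteringCountMapper extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private Set<String> filter = new HashSet<String>();

        @Override
        public void configure(JobConf job) {
            try {
                // Files added via DistributedCache.addCacheFile() show up here
                Path[] cached = DistributedCache.getLocalCacheFiles(job);
                if (cached != null && cached.length > 0) {
                    // Assumed format: one token to filter out per line
                    BufferedReader reader = new BufferedReader(
                            new FileReader(cached[0].toString()));
                    String token;
                    while ((token = reader.readLine()) != null) {
                        filter.add(token.trim());
                    }
                    reader.close();
                }
            } catch (IOException e) {
                throw new RuntimeException("Failed to load filter file", e);
            }
        }

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                String token = tokenizer.nextToken();
                if (filter.contains(token)) {
                    continue; // drop tokens listed in the filter file
                }
                word.set(token);
                output.collect(word, one);
                reporter.incrCounter(MyCounter.TOKEN, 1);
            }
        }
    }

To use it, you would swap conf.setMapperClass(CountMapper.class) for conf.setMapperClass(FilteringCountMapper.class) in count().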
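Also, since JobClient.runJob() blocks until the job finishes, the TOKEN counter is only visible through Hadoop's own client-side progress output. If you wanted to poll it yourself while the job runs, a minimal sketch along the following lines should work, assuming MyCounter is made public so it can be referenced from outside Count (the CountMonitor class name and the five-second polling interval are my own choices, not part of the original program):

    package wordCount;

    import org.apache.hadoop.mapred.Counters;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.RunningJob;

    public class CountMonitor {
        // Submit the job without blocking, then poll its counters until done
        public static void runAndMonitor(JobConf conf) throws Exception {
            JobClient client = new JobClient(conf);
            RunningJob job = client.submitJob(conf); // returns immediately
            while (!job.isComplete()) {
                Counters counters = job.getCounters();
                if (counters != null) {
                    // Requires MyCounter to be visible outside Count
                    long tokens = counters.getCounter(Count.MyCounter.TOKEN);
                    System.out.printf("map %.0f%%, reduce %.0f%%, tokens so far: %d%n",
                            job.mapProgress() * 100, job.reduceProgress() * 100,
                            tokens);
                }
                Thread.sleep(5000);
            }
        }
    }

This only matters if you want counter updates between Hadoop's own progress reports; otherwise the blocking runJob() call in count() is simpler.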