MapReduce实现WordCount以及常见问题解决

最新推荐文章于 2023-12-23 12:48:18 发布

Tim_long

最新推荐文章于 2023-12-23 12:48:18 发布

阅读量2.8k

点赞数

分类专栏： big data 文章标签： big data mapreduce Word Count hadoop hadoop.dll

本文链接：https://blog.csdn.net/qq_38161676/article/details/79155042

版权

big data 专栏收录该内容

2 篇文章

订阅专栏

Hadoop环境开发环境测试

最近在做一个大数据方面的小工程，需要使用基于hadoop环境进行计算。MapReduce是Hadoop生态系统下的成熟的计算框架，在开发之前需要搭建MapReduce开发环境并测试。

由于我本身就是大数据方面的新手，因此在开始的时候遇到不小的麻烦，但在慢慢的寻找解决办法的过程中还是有所感悟的，因此有必要做一下记录，希望给读者有所帮助！

通常我们需要在搭好的hadoop环境下进行“wordcount”程序测试，好看是否能够正常运行。一开始一般是不能直接成功运行 WordCount程序的，因为在windows系统下，system32文件下缺少必要文件“winutils.exe”，另外在hadoop版本文件bin目

录下缺少文件“hadoop.dll”文件。大家可以在点击这里下载。

step1：向HDFS上传一个包含一以下内容的txt文件number.txt

对文件数据使用MapReduce编程：

package cn.edu.nuist.wordcount;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
/**
 * 
 * 描述：WordCount explains 
 * @author Hadoop Dev Group
 */
public class WordCount
{
	private static String inPath = "/countWords/number.txt";
	private static String outPath = "/countWords/output";
	private static String hdfs = "hdfs://10.255.248.61:9000";  //这里配置集群中master主机IP和端口号


	/**
	 * MapReduceBase类:实现了Mapper和Reducer接口的基类（其中的方法只是实现接口，而未作任何事情）
	 * Mapper接口：
	 * WritableComparable接口：实现WritableComparable的类可以相互比较。所有被用作key的类应该实现此接口。
	 * Reporter 则可用于报告整个应用的运行进度，本例中未使用。 
	 * 
	 */
	public static class Map extends MapReduceBase implements
	Mapper<LongWritable, Text, Text, IntWritable>
	{
		/**
		 * LongWritable, IntWritable, Text 均是 Hadoop 中实现的用于封装 Java 数据类型的类，这些类实现了WritableComparable接口，
		 * 都能够被串行化从而便于在分布式环境中进行数据交换，你可以将它们分别视为long,int,String 的替代品。
		 */
		private final static IntWritable one = new IntWritable(1);
		private Text word = new Text();

		/**
		 * Mapper接口中的map方法：
		 * void map(K1 key, V1 value, OutputCollector<K2,V2> output, Reporter reporter)
		 * 映射一个单个的输入k/v对到一个中间的k/v对
		 * 输出对不需要和输入对是相同的类型，输入对可以映射到0个或多个输出对。
		 * OutputCollector接口：收集Mapper和Reducer输出的<k,v>对。
		 * OutputCollector接口的collect(k, v)方法:增加一个(k,v)对到output
		 */
		public void map(LongWritable key, Text value,
				OutputCollector<Text, IntWritable> output, Reporter reporter)
						throws IOException
		{
			String line = value.toString();
			StringTokenizer tokenizer = new StringTokenizer(line);
			while (tokenizer.hasMoreTokens())
			{
				word.set(tokenizer.nextToken());
				output.collect(word, one);
			}
		}
	}
	public static class Reduce extends MapReduceBase implements
	Reducer<Text, IntWritable, Text, IntWritable>
	{
		public void reduce(Text key, Iterator<IntWritable> values,
				OutputCollector<Text, IntWritable> output, Reporter reporter)
						throws IOException
		{
			int sum = 0;
			while (values.hasNext())
			{
				sum += values.next().get();
			}
			output.collect(key, new IntWritable(sum));
		}
	}


	public static void main(String[] args) throws Exception
	{
		/**
		 * JobConf：map/reduce的job配置类，向hadoop框架描述map-reduce执行的工作
		 * 构造方法：JobConf()、JobConf(Class exampleClass)、JobConf(Configuration conf)等
		 */
		JobConf conf = new JobConf(WordCount.class);
		conf.setJobName("wordcount");          //设置一个用户定义的job名称

		conf.set("fs.default.name", hdfs);

		conf.setOutputKeyClass(Text.class);    //为job的输出数据设置Key类
		conf.setOutputValueClass(IntWritable.class);  //为job输出设置value类
		conf.setMapperClass(Map.class);        //为job设置Mapper类
		conf.setCombinerClass(Reduce.class);      //为job设置Combiner类
		conf.setReducerClass(Reduce.class);        //为job设置Reduce类
		conf.setInputFormat(TextInputFormat.class);    //为map-reduce任务设置InputFormat实现类
		conf.setOutputFormat(TextOutputFormat.class);  //为map-reduce任务设置OutputFormat实现类
		/**
		 * InputFormat描述map-reduce中对job的输入定义
		 * setInputPaths():为map-reduce job设置路径数组作为输入列表
		 * setInputPath()：为map-reduce job设置路径数组作为输出列表
		 */
		
		//设置输入输出文件路径     
		FileSystem fs = FileSystem.get(conf);
		
		Path inputPath = new Path(inPath);
		if(fs.exists(inputPath)) {
		FileInputFormat.setInputPaths(conf, inputPath);
		}
		
		Path outputPath = new Path(outPath);
		fs.delete(outputPath, true);
		
		
		FileOutputFormat.setOutputPath(conf, outputPath);
		
		
		JobClient.runJob(conf);        //运行一个job
	}
}

运行后refreshHDFS文件系统，可以得到输出文件的信息：