第一个MapReduce程序

第一个程序一般都是Hello World。对MapReduce来说,与之对应的入门程序就是单词计数(WordCount),主要代码如下:

package Temperature;

import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
 
public class WordCount {

	/**
	 * Mapper: the job input is first split into InputSplits, and TextInputFormat
	 * then feeds each split to the mapper line by line as
	 * &lt;byte-offset, line-of-text&gt; pairs. For every whitespace-separated
	 * token in the line this mapper emits a &lt;word, 1&gt; pair.
	 *
	 * Hadoop writable to Java type mapping (for reference):
	 *   BooleanWritable === boolean, ByteWritable === byte,
	 *   ShortWritable === short, IntWritable === int, LongWritable === long,
	 *   FloatWritable === float, DoubleWritable === double,
	 *   Text === String, ArrayWritable === array, MapWritable === map
	 */
	public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
		// Reused across map() calls to avoid allocating a new object per token.
		private final static IntWritable one = new IntWritable(1);
		private final Text word = new Text();

		public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
				throws IOException {
			// Split the line on whitespace and emit <token, 1> for each token.
			StringTokenizer tokenizer = new StringTokenizer(value.toString());
			while (tokenizer.hasMoreTokens()) {
				word.set(tokenizer.nextToken());
				output.collect(word, one);
			}
		}
	}

	/**
	 * Reducer: the framework sorts the mapper output by key and groups the
	 * values, so each reduce() call receives &lt;word, [1, 1, ...]&gt;. The
	 * counts are summed and a single &lt;word, total&gt; pair is written to
	 * the job output on HDFS. (No combiner is configured for this job.)
	 */
	public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
		public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
				Reporter report) throws IOException {
			int sum = 0;
			while (values.hasNext()) {
				sum += values.next().get();
			}
			output.collect(key, new IntWritable(sum));
		}
	}

	// Original hard-coded paths, kept as defaults for backward compatibility
	// when no command-line arguments are supplied.
	private static final String DEFAULT_INPUT = "hdfs://192.168.1.51:9000/input/qixiang_data";
	private static final String DEFAULT_OUTPUT = "hdfs://192.168.1.51:9000/output/lzh/3";

	/**
	 * Configures and submits the word-count job, blocking until it completes.
	 *
	 * @param args optional: args[0] = input path, args[1] = output path;
	 *             falls back to the original hard-coded HDFS paths when absent
	 * @throws Exception if job submission or execution fails
	 */
	public static void main(String[] args) throws Exception {
		JobConf conf = new JobConf(WordCount.class);
		conf.setJobName("wordcount");

		// Output key/value types (shared by the map and reduce stages here).
		conf.setOutputKeyClass(Text.class);
		conf.setOutputValueClass(IntWritable.class);
		// Map and Reduce implementations.
		conf.setMapperClass(Map.class);
		conf.setReducerClass(Reduce.class);
		// Plain-text input and output formats.
		conf.setInputFormat(TextInputFormat.class);
		conf.setOutputFormat(TextOutputFormat.class);
		// Input/output paths: taken from the command line when supplied,
		// otherwise the legacy hard-coded defaults are used.
		String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
		String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;
		FileInputFormat.setInputPaths(conf, new Path(inputPath));
		FileOutputFormat.setOutputPath(conf, new Path(outputPath));
		// Submit the job and wait for completion.
		JobClient.runJob(conf);
	}

}

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/Li_and_Li/article/details/80323197
个人分类: 大数据学习