MapReduce第一个程序之WordCount

步骤

1)创建 WordCount 类,继承 Configured 并实现 Tool 接口
2)实现 Mapper 内部类
3)实现 Reducer 内部类
4)设置 Job 相关信息
5)提交 Job 运行

代码实现


package com.hainiuxy;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Counts how many times each word occurs in the input text files.
 *
 * <p>The job is configured in {@link #run(String[])} and launched from
 * {@link #main(String[])} through {@link ToolRunner}.
 */
public class WordCount extends Configured implements Tool {

	/**
	 * Mapper that splits each input line into words and emits {@code <word, 1>}.
	 *
	 * <p>The input types are dictated by the input format class:
	 * {@code TextInputFormat extends FileInputFormat<LongWritable, Text>}, and its
	 * {@code createRecordReader()} returns a {@code LineRecordReader}, which
	 * {@code extends RecordReader<LongWritable, Text>}. Hence:
	 * <ul>
	 *   <li>KEYIN: byte offset of the line within the file, wrapped in {@link LongWritable}</li>
	 *   <li>VALUEIN: one line of text, wrapped in {@link Text}</li>
	 * </ul>
	 *
	 * <p>Example for a file containing the lines {@code one world} and {@code one dream}:
	 * <pre>
	 * keyin   valuein
	 * 0       one world
	 * 10      one dream
	 * </pre>
	 *
	 * <p>The output types are a business decision:
	 * <ul>
	 *   <li>KEYOUT: the word, {@link Text}</li>
	 *   <li>VALUEOUT: the count, {@link LongWritable}</li>
	 * </ul>
	 */
	public static class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

		/** Reused output key holding the current word. */
		private final Text keyOut = new Text();

		/** Reused output value; always 1, i.e. one occurrence per emitted word. */
		private final LongWritable valueOut = new LongWritable(1L);

		/**
		 * Called once per input line; emits {@code <word, 1>} for every word on the line.
		 *
		 * @param key byte offset of the line
		 * @param value the line content, e.g. {@code one world}
		 * @param context sink for the emitted pairs
		 * @throws IOException if writing to the context fails
		 * @throws InterruptedException if the task is interrupted while writing
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			System.out.println("------------------------------");
			String line = value.toString();
			System.out.println("map input:" + key.get() + ", " + line);

			// Split on any run of whitespace so tabs and repeated spaces are treated
			// as a single separator, e.g. "one  world" -> [one, world].
			for (String word : line.split("\\s+")) {
				// A blank line or leading whitespace yields an empty token;
				// it is not a word and must not be counted.
				if (word.isEmpty()) {
					continue;
				}

				keyOut.set(word);
				context.write(keyOut, valueOut);

				System.out.println("map output:" + word + ", " + valueOut.get());
			}
		}
	}

	/**
	 * Reducer that sums the occurrence counts of each word.
	 *
	 * <p>KEYIN/VALUEIN must match the mapper's output types. KEYOUT/VALUEOUT are a
	 * business decision: the word as {@link Text} and its total as {@link LongWritable}.
	 */
	public static class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

		/** Reused output value holding the final count of the current word. */
		private final LongWritable valueOut = new LongWritable();

		/**
		 * Called once per distinct key with all of its values, e.g. {@code <one, [1,1,1]>};
		 * emits {@code <word, total>}.
		 *
		 * @param key the word
		 * @param values the counts emitted for this word
		 * @param context sink for the emitted pair
		 * @throws IOException if writing to the context fails
		 * @throws InterruptedException if the task is interrupted while writing
		 */
		@Override
		protected void reduce(Text key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {
			System.out.println("-------------------------");

			long sum = 0L;
			StringBuilder sb = new StringBuilder();
			sb.append("reduce input:").append(key.toString()).append(", [");

			// Put the separator before every element except the first, so no trailing
			// comma has to be stripped (stripping would eat the '[' on an empty list).
			boolean first = true;
			for (LongWritable w : values) {
				if (!first) {
					sb.append(",");
				}
				first = false;
				sb.append(w.get());
				sum += w.get();
			}
			sb.append("]");
			System.out.println(sb.toString());

			// Store the total in the reusable writable so that both the emitted record
			// and the log line below report the same, real value.
			valueOut.set(sum);
			context.write(key, valueOut);

			System.out.println("reduce output:" + key.toString() + ", " + valueOut.get());
		}
	}

	/**
	 * Configures the word-count job, removes a pre-existing output directory and
	 * runs the job to completion.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are missing
	 * @throws Exception if job setup, file-system access or job execution fails
	 */
	@Override
	public int run(String[] args) throws Exception {
		if (args == null || args.length < 2) {
			System.err.println("Usage: WordCount <input path> <output path>");
			return 2;
		}

		Configuration conf = getConf();
		// Create the job and name it "wordcount".
		Job job = Job.getInstance(conf, "wordcount");

		// Class used to locate the job jar, plus the mapper and reducer implementations.
		job.setJarByClass(WordCount.class);
		job.setMapperClass(WordCountMapper.class);
		job.setReducerClass(WordCountReducer.class);

		// The default is a single reducer; this only needs to be set when a different
		// number is wanted. Here two reducers are used.
		job.setNumReduceTasks(2);

		// Types of the mapper output.
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(LongWritable.class);

		// Types of the final (reducer) output.
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);

		// TextInputFormat / TextOutputFormat are the defaults, so for plain text these
		// two calls are optional; any other format must be set explicitly.
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);

		FileInputFormat.addInputPath(job, new Path(args[0]));

		Path outputDir = new Path(args[1]);
		FileOutputFormat.setOutputPath(job, outputDir);

		// Delete an existing output directory so the job can be re-run.
		// Resolve the file system from the path itself so that an output path with an
		// explicit scheme is handled by the file system that actually owns it.
		FileSystem fs = outputDir.getFileSystem(conf);
		if (fs.exists(outputDir)) {
			// Recursive delete.
			fs.delete(outputDir, true);
			System.out.println("output dir【" + outputDir.toString() + "】 is deleted");
		}

		// Blocks until the job finishes.
		return job.waitForCompletion(true) ? 0 : 1;
	}

	/**
	 * Entry point; example arguments: {@code /tmp/mr/input /tmp/mr/output}.
	 *
	 * @param args command-line arguments, forwarded to {@link ToolRunner}
	 * @throws Exception if the tool fails
	 */
	public static void main(String[] args) throws Exception {
		System.exit(ToolRunner.run(new WordCount(), args));
	}

}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值