A Simple MapReduce WordCount Implementation
Part 1: Java code
package com.beicai.am;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/* KEYIN    LongWritable  byte offset of the line within the file
 * VALUEIN  Text          one line of input
 * KEYOUT   Text          the word (unique key)
 * VALUEOUT IntWritable   the occurrence count, an int
 *
 * Compared with plain Java types: int becomes IntWritable and String becomes Text (analogous wrappers).
 */
// main class
public class Demo01WordCount {
    // Mapper implementation
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // key is the byte offset; value is one line of text,
            // e.g. "hello world hello hi hi world"
            String line = value.toString();
            for (String w : line.split(" ")) {
                // context is the output channel to the shuffle phase;
                // reuse the Text/IntWritable objects instead of allocating per record
                word.set(w);
                context.write(word, one);
            }
        }
    }
    // Reducer implementation
    public static class MyReducer extends Reducer<Text, IntWritable, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            // key:    the word
            // values: the list of 1s emitted for this word, e.g. (1, 1, 1, 1, 1)
            int count = 0;
            for (IntWritable v : values) {
                // iterate over the list and accumulate
                count += v.get();
            }
            context.write(key, new Text(String.valueOf(count)));
        }
    }
    // driver
    public static void main(String[] args) throws Exception {
        // create the configuration object
        Configuration conf = new Configuration();
        // get the Job instance (new Job(conf, "") is deprecated)
        Job job = Job.getInstance(conf, "Demo01WordCount");
        // set the jar to run
        job.setJarByClass(Demo01WordCount.class);
        // set the mapper class and its output types
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // set the reducer class and the final output types
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // input path
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // output path; it must not already exist
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        int status = job.waitForCompletion(true) ? 0 : -1;
        System.exit(status);
    }
}
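Optional: WordCount usually benefits from a combiner, which pre-aggregates the counts on the map side before the shuffle. Below is a minimal sketch (my addition, not part of the original code). A combiner's input and output types must match the map output types (Text, IntWritable), so the Text-emitting MyReducer above cannot be reused directly; to enable it, also add job.setCombinerClass(WordCountCombiner.class); in the driver after setMapperClass.

    // Hypothetical combiner: sums the 1s per word locally on the map side,
    // cutting shuffle traffic. Add it as another nested class of Demo01WordCount.
    public static class WordCountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable sum = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable v : values) {
                count += v.get();
            }
            sum.set(count);
            context.write(key, sum);
        }
    }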
Part 2: Package Demo01WordCount into a jar (named demo.jar) and copy it to the Linux machine.
I put demo.jar under /usr/hello.
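For example, it can be copied over with scp (the user and host name here are placeholders):
scp demo.jar root@node01:/usr/hello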
Part 3: Run the job
Create the input directory: hdfs dfs -mkdir -p /usr/hello/in
Upload a test file (dream.txt) into in: hdfs dfs -put dream.txt /usr/hello/in
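If you do not have a test file handy, one can be created locally first (the contents are just an example):
echo "hello world hello hi hi world" > dream.txt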
Run the job: hadoop jar ./demo.jar /usr/hello/in/dream.txt /usr/hello/in/101    // the output path must not already exist
Or: yarn jar ./demo.jar /usr/hello/in/dream.txt /usr/hello/in/101    // same rule: the output path must not already exist
(If demo.jar's manifest does not record a main class, add com.beicai.am.Demo01WordCount right after the jar name.)
View the result in the NameNode web UI (port 50070): /usr/hello/in/101/part-r-00000
Or from the Linux shell: hdfs dfs -cat /usr/hello/in/101/part-r-00000 (equivalently: hadoop fs -cat /usr/hello/in/101/part-r-00000)
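Assuming dream.txt contains the single example line from the echo command above, part-r-00000 would read (word and count, tab-separated, keys sorted):
hello	2
hi	2
world	2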
Done!
The MapReduce principle in one line: split the work first (map), then merge the partial results (reduce).
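To make the split-then-merge idea concrete without a cluster, here is a minimal in-memory sketch in plain Java (the class name and sample data are illustrative only):

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

// Plain-Java illustration of MapReduce's split-then-merge idea (no Hadoop):
// the "map" step splits each line into words, the "reduce" step merges the
// per-word counts. TreeMap keeps keys sorted, like part-r-00000.
public class MiniWordCount {
    public static void main(String[] args) {
        List<String> lines = Arrays.asList("hello world hello", "hi hi world");
        Map<String, Integer> counts = new TreeMap<>();
        for (String line : lines) {                  // "map": split into words
            for (String word : line.split(" ")) {
                counts.merge(word, 1, Integer::sum); // "reduce": accumulate the count
            }
        }
        counts.forEach((w, c) -> System.out.println(w + "\t" + c));
    }
}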