017 MapReduce的老版本API MapReduce中的参数传递 MapReduce中的压缩

最新推荐文章于 2022-07-29 00:16:56 发布

C_time

最新推荐文章于 2022-07-29 00:16:56 发布

阅读量210

点赞数

分类专栏：大数据文章标签： MapReduce的老版本API的使用 MapReduce中的参数传递 MapReduce中的压缩

本文链接：https://blog.csdn.net/C_time/article/details/90727696

版权

大数据专栏收录该内容

32 篇文章 0 订阅

订阅专栏

MapReduce的老版本API

找找不同（跟2.X比）
1.注意1.X导入的都是短包
*就是mapred的包
在这里插入图片描述
2.划线处不同
1.X 的Mapper 这个使用StringTokenizer来分割

2.X的Mapper

3. 1.X的Reducer 还有这个使用while循环

在这里插入图片描述
2.X的Reducer 还有这个使用for循环

运行数据用以前的de
运行

查看输出文件：文件名是 part-00000，中间没有 -r（2.X 新版 API 的输出文件名是 part-r-00000）

结果没问题

package qf.com.mr;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;



/*
*@author Shishuai E-mail：1198319583@qq.com
*@version Create time ： 2019年6月1日下午2:43:03
*类说明：hadoop 1.X的写法
*
*注意1.X导入的都是短包
*就是mapred的包
*/
public class OldApi {
	public static class MyMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text>{

		public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
				throws IOException {
			String line = value.toString();
			
			//StringTokenizer 这个类默认给你用空格或者\t来切分
			StringTokenizer st =  new StringTokenizer(line);
			while(st.hasMoreTokens()) {
				output.collect(new Text(st.nextToken()), new Text(1+""));
			}
			
		}
		
	}
	public static class MyReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text>{

		public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
				throws IOException {
			int counter = 0;
			while(values.hasNext()) {
				counter += Integer.parseInt(values.next().toString());
			}
			output.collect(key, new Text(counter+""));
		}

		
	}
	public static void main(String[] args) {
		Configuration conf = new Configuration();
		
		try {
			JobConf job = new JobConf(conf);
			job.setJarByClass(OldApi.class);
			
			job.setMapperClass(MyMapper.class);
			job.setMapOutputKeyClass(Text.class);
			job.setMapOutputValueClass(Text.class);
			FileInputFormat.addInputPath(job, new Path(args[0]));
			
			job.setReducerClass(MyReducer.class);
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(Text.class);
			FileOutputFormat.setOutputPath(job, new Path(args[1]));
			
			//提交
			JobClient jc = new JobClient(job);
			RunningJob rj = jc.runJob(job);
			int isok = rj.getJobState();
			if(isok == 2) {
				System.exit(0);
			}else {
				System.exit(1);
			}
			
		} catch (Exception e) {
		}
	}
}

MapReduce中的参数传递

一定要在设置Job之前设置参数传递

在job前边设置
在这里插入图片描述

在src下创建一个xml文件
在这里插入图片描述
格式参考截图里的文件

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Job parameters for the parameter-passing demo; presumably saved as Param.xml
     under src so that it can be loaded from the classpath root - confirm. -->
<configuration>
<!-- Input directory of the job; looked up by key "mapreduce.input.dir". -->
<property>
    <name>mapreduce.input.dir</name>
    <value>/de</value>
</property>
<!-- Output directory of the job; looked up by key "mapreduce.output.dir". -->
<property>
    <name>mapreduce.output.dir</name>
    <value>/out/00</value>
</property>
</configuration>

emmm
我把hdfs的out都清理了
重新上传了一个out空文件夹

运行传个参数 0824

yarn jar /home/wc.jar qf.com.mr.ParamDemo 0824

在这里插入图片描述
看看控制台这边运行的输出

	first param in map
		first param=0
	first param in reduce
		first param=0
	File Input Format Counters 
		Bytes Read=92
	File Output Format Counters 
		Bytes Written=71
	second param in map
		0824=0
	second param in reduce
		0824=0
	three param in map
		666=0
	three param in reduce
		666=0

在这里插入图片描述

这个也是有结果的没有问题
在这里插入图片描述
好好看看代码
是怎么传递的

代码是Module01模板复制过来的 map和reduce具体代码是MywordCount的代码
具体看看参数传递的部分
在这里插入图片描述
还有下面改动的地方

package qf.com.mr;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


/**
 * Word count that demonstrates passing parameters to map and reduce tasks.
 *
 * <p>Three kinds of values reach the tasks: a literal set on the job
 * {@link Configuration} ({@code param1}), the single command-line argument copied
 * into the configuration ({@code param2}), and the compile-time constant
 * {@link #three}. Input and output directories are read from {@code /Param.xml}
 * on the classpath. All configuration values must be set before the {@link Job}
 * is created.
 *
 * <p>Usage: {@code yarn jar <jar> qf.com.mr.ParamDemo <param2>}
 *
 * @author Shishuai E-mail: 1198319583@qq.com
 * @version Create time: 2019-05-28 17:42:34
 */
public class ParamDemo implements Tool{
	public static final int three = 666;

	/** Classpath location of the XML file holding the input/output directories. */
	private static final String PARAM_RESOURCE = "/Param.xml";
	private static final String INPUT_DIR_KEY = "mapreduce.input.dir";
	private static final String OUTPUT_DIR_KEY = "mapreduce.output.dir";
	/** Return code of {@link #run(String[])} when the command line is malformed. */
	private static final int EXIT_USAGE = 2;

	/** Configuration handed over by {@link ToolRunner} through {@link #setConf(Configuration)}. */
	private Configuration conf;

	/**
	 * Map phase: emits {@code (word, 1)} for every space-separated word of a line.
	 */
	public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
		// NOTE(review): these holders are public static and mutable, i.e. shared by every
		// mapper instance in the same JVM. Left unchanged to keep the class's public
		// surface; consider private final instance fields.
		public static Text k = new Text();
		public static IntWritable v = new IntWritable();

		/**
		 * @param key     byte offset of the line (unused)
		 * @param value   one line of input text
		 * @param context task context used for output, configuration and counters
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// Split the line on single spaces and emit one pair per word.
			for (String word : value.toString().split(" ")) {
				k.set(word);
				v.set(1);
				context.write(k, v);
			}

			// Looking a counter up (without incrementing it) is enough to make it show
			// up in the job's counter report with value 0; the parameter value is
			// carried in the counter name.
			Configuration taskConf = context.getConfiguration();
			context.getCounter("first param in map", taskConf.get("param1"));
			context.getCounter("second param in map", taskConf.get("param2"));
			context.getCounter("three param in map", three + "");
		}

	}

	/**
	 * Reduce phase: sums the counts of each word and emits {@code (word, total)}.
	 */
	public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable>{

		/**
		 * @param key     the word being counted
		 * @param values  all counts emitted for this word
		 * @param context task context used for output, configuration and counters
		 */
		@Override
		protected void reduce(Text key, Iterable<IntWritable> values, Context context)
				throws IOException, InterruptedException {
			int counter = 0;
			for (IntWritable i : values) {
				counter += i.get();
			}
			// Written once per key, after all of its values have been summed.
			context.write(key, new IntWritable(counter));

			// Same counter-name trick as in the mapper, to show the values on the reduce side.
			Configuration taskConf = context.getConfiguration();
			context.getCounter("first param in reduce", taskConf.get("param1"));
			context.getCounter("second param in reduce", taskConf.get("param2"));
			context.getCounter("three param in reduce", three + "");
		}

	}

	/**
	 * Stores the configuration and applies the HDFS HA client settings of the
	 * {@code qf} nameservice to it.
	 *
	 * @param conf configuration supplied by {@link ToolRunner}; may be {@code null}
	 */
	@Override
	public void setConf(Configuration conf) {
		// Keep the instance: it carries the generic options parsed by ToolRunner, and
		// run() must build the job from it for the settings below to have any effect.
		this.conf = conf;
		if (conf == null) {
			return;
		}
		conf.set("fs.defaultFS", "hdfs://qf");
		conf.set("dfs.nameservices", "qf");
		// Comma-separated NameNode IDs, written without embedded whitespace.
		conf.set("dfs.ha.namenodes.qf", "nn1,nn2");
		conf.set("dfs.namenode.rpc-address.qf.nn1", "hadoop01:9000");
		conf.set("dfs.namenode.rpc-address.qf.nn2", "hadoop02:9000");
		conf.set("dfs.client.failover.proxy.provider.qf", "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
	}

	/**
	 * @return the configuration this tool runs with; never {@code null}
	 */
	@Override
	public Configuration getConf() {
		if (conf == null) {
			// Not driven by ToolRunner: fall back to a default configuration.
			setConf(new Configuration());
		}
		return conf;
	}

	/**
	 * Builds and runs the job.
	 *
	 * @param args exactly one element, the value stored under {@code param2}
	 * @return 0 if the job succeeded, 1 if it failed, 2 if {@code args} is malformed
	 * @throws Exception if the parameter file or a required setting is missing, or
	 *                   if job setup or submission fails
	 */
	@Override
	public int run(String[] args) throws Exception {
		// Validate before touching args[0].
		if (args.length != 1) {
			System.err.println("usage: yarn jar <jar> qf.com.mr.ParamDemo <param2>");
			return EXIT_USAGE;
		}

		// 1. Start from the configuration prepared by ToolRunner / setConf.
		Configuration jobConf = getConf();

		// 2. Load the parameter file and set the parameters. This has to happen before
		//    the Job is created from the configuration.
		java.io.InputStream paramStream = ParamDemo.class.getResourceAsStream(PARAM_RESOURCE);
		if (paramStream == null) {
			throw new IOException(PARAM_RESOURCE + " not found on the classpath");
		}
		jobConf.addResource(paramStream);
		jobConf.set("param1", "first param");
		jobConf.set("param2", args[0]);

		// 3. Create the job and set its main class.
		Job job = Job.getInstance(jobConf, "job");
		job.setJarByClass(ParamDemo.class);

		// 4. Input and output paths come from the parameter file.
		setInputAndOutput(job, jobConf);

		// 5. Map phase.
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);

		// 6. Reduce phase.
		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		return job.waitForCompletion(true) ? 0 : 1;
	}

	/**
	 * Entry point: runs the tool, prints its result code and exits with it.
	 */
	public static void main(String[] args) throws Exception {
		int isok = ToolRunner.run(new Configuration(), new ParamDemo(), args);
		System.out.println(isok);
		System.exit(isok);
	}

	/**
	 * Sets the job's input and output paths from the configuration. An existing
	 * output directory is deleted recursively first.
	 *
	 * @param job  the job to configure
	 * @param conf configuration holding {@code mapreduce.input.dir} and
	 *             {@code mapreduce.output.dir}
	 * @throws IOException if a setting is missing or the file system cannot be accessed
	 */
	private void setInputAndOutput(Job job, Configuration conf) throws IOException {
		FileInputFormat.addInputPath(job, new Path(requireSetting(conf, INPUT_DIR_KEY)));

		Path outputpath = new Path(requireSetting(conf, OUTPUT_DIR_KEY));
		FileSystem fs = FileSystem.get(conf);
		if (fs.exists(outputpath)) {
			fs.delete(outputpath, true);
		}
		FileOutputFormat.setOutputPath(job, outputpath);
	}

	/** Returns the non-blank value of {@code key}, or fails with a descriptive error. */
	private static String requireSetting(Configuration conf, String key) throws IOException {
		String value = conf.get(key);
		if (value == null || value.trim().isEmpty()) {
			throw new IOException("Missing required setting '" + key + "' (expected in " + PARAM_RESOURCE + ")");
		}
		return value;
	}

}

MapReduce中的压缩

代码我们使用上面的ParamDemo 命名为CompressDemo
删除参数传递的几行
然后在conf上进行设置
要在job之前设置

对了
具体内容使用notepad++连接hadoop 找到mapred-default.xml文件我们复制里面的参数
（一开始我以为在hadoop的文件里能找到这个文件，即 user local hadoop-2.7.1 下的 etc 的 hadoop 文件夹下有这个配置文件，可是没找到；视频也没有说，直接跳过去了；然后就百度，百度也没有。
气死就想直接照着视频敲不过太多了不想敲
继续百度）
看到一个帖子说是
在这里插入图片描述
那么我想是不是这个我要找的mapred-default.xml文件也在jar包里我就从eclipse的Maven找
功夫不负有心人找到了

在hadoop-mapreduce-client-core-2.7.1.jar包里面
哇开心
在这里插入图片描述
然后就一通复制
哦了

所以记住

default之类的配置文件不在hadoop配置文件夹下在jar包里

找到这里
在这里插入图片描述

设置Map的和Reduce的输出压缩
那么为什么不设置reduce的输出解压缩呢？能识别吗？能。MapReduce会自动寻找对应的算法来解压缩输入的内容
输入数据可以是压缩文件 mapreduce自动找解压的算法

运行
因为使用了上一个参数传递的代码，所以还是要传一个参数，值是什么无所谓，我们主要看压缩的设置对不对

yarn jar /home/wc.jar qf.com.mr.CompressDemo 0824

运行完成
在这里插入图片描述

大小69
可能比不压缩还大
因为数据量过小
体现不出压缩的优势
而且压缩文件里还会带上一些压缩算法自身的信息，所以看上去比较大

结果文件是 .deflate 后缀，使用 cat 没法查看，结果是乱码；我们使用 -text 查看
在这里插入图片描述
有结果结果是什么无所谓的

C_time

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
017 MapReduce的老版本API MapReduce中的参数传递 MapReduce中的压缩

MapReduce的老版本API
复制链接

扫一扫

专栏目录