大数据笔记07--MR案例开发

最新推荐文章于 2023-12-26 11:55:06 发布

KissshotAHUB

最新推荐文章于 2023-12-26 11:55:06 发布

阅读量307

点赞数 1

分类专栏：大数据笔记

本文链接：https://blog.csdn.net/KissshotAHUB/article/details/83150939

版权

大数据笔记专栏收录该内容

10 篇文章 0 订阅

订阅专栏

三个案例

wordcount
温度统计
推荐好友

wordcount

案例需求

统计输入的文件中，每个单词出现了几次

分析设计

在map中将输入的每条数据切割成单词，将key为单词，value为1的计算结果输出
默认的分组器会将相同key（单词）的数据分为一组，输入reduce
在reduce中，遍历输入的数据，将value加和（sum），输出单词和sum到文件中

代码

主类

/**
 * Driver for the word-count job: reads the files under {@code /test/},
 * counts how often each word occurs, and writes the totals to {@code /output}.
 */
public class MyWC {
	/**
	 * Configures the word-count job, submits it, and waits for it to finish.
	 * The JVM exits with status 0 when the job succeeds and 1 when it fails.
	 *
	 * @param args unused
	 * @throws Exception if the job cannot be configured, submitted, or monitored
	 */
	public static void main(String[] args) throws Exception {
		// Load the Hadoop configuration, including the default resources.
		Configuration conf = new Configuration(true);
		Job job = Job.getInstance(conf);

		// Locate the job jar from the class that contains main().
		// When submitting from an IDE, point job.setJar(...) at the built jar here.
		// Never pass an empty string: it would replace the jar located by
		// setJarByClass with an unusable path.
		job.setJarByClass(MyWC.class);

		// Input directory.
		FileInputFormat.setInputPaths(job, "/test/");

		// Output directory; remove a previous run's output first because the
		// job refuses to start when the output directory already exists.
		Path outPath = new Path("/output");
		FileSystem fs = outPath.getFileSystem(conf);
		if (fs.exists(outPath)) {
			fs.delete(outPath, true);
		}
		FileOutputFormat.setOutputPath(job, outPath);

		// Mapper and the key/value types it emits.
		job.setMapperClass(MyWCMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);

		// Reducer and the key/value types of the final output.
		job.setReducerClass(MyWCReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		// Number of reduce tasks (one output file per reduce task).
		job.setNumReduceTasks(2);

		// Run the job; true prints progress information while waiting.
		// Propagate the result so callers and schedulers can detect failures.
		boolean succeeded = job.waitForCompletion(true);
		System.exit(succeeded ? 0 : 1);
	}
}

/**
 * Word-count mapper: splits each input line on spaces and emits
 * {@code (word, 1)} for every non-empty token.
 */
public class MyWCMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
	// Output key, reused for every record instead of allocating one per word.
	Text myKey = new Text();
	// Constant count of 1 emitted for each word occurrence.
	IntWritable myValue = new IntWritable(1);

	/**
	 * Emits one {@code (word, 1)} pair per word of the input line.
	 *
	 * @param key     input key supplied by the input format (unused)
	 * @param value   one line of input text
	 * @param context sink for the emitted pairs
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {

		// Split the line on single spaces with Hadoop's StringUtils.
		String[] words = StringUtils.split(value.toString(), ' ');
		for (String word : words) {
			// Leading or consecutive spaces produce empty tokens; counting
			// them would add a bogus "" word to the result.
			if (word.isEmpty()) {
				continue;
			}
			myKey.set(word);
			context.write(myKey, myValue);
		}
	}
}

Reduce

/**
 * Word-count reducer: adds up the counts emitted for one word and writes
 * the word together with its total.
 */
public class MyWCReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
	/**
	 * Called once per group. With the default grouping, every value in
	 * {@code values} belongs to the same word {@code key}.
	 *
	 * @param key     the word shared by all values of this group
	 * @param values  the counts emitted by the mappers for this word
	 * @param context sink for the {@code (word, total)} pair
	 */
	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context) 
			throws IOException, InterruptedException {
		IntWritable total = new IntWritable(sumOf(values));
		context.write(key, total);
	}

	/** Returns the sum of all counts in {@code counts}. */
	private int sumOf(Iterable<IntWritable> counts) {
		int total = 0;
		for (IntWritable count : counts) {
			total = total + count.get();
		}
		return total;
	}
}

温度统计

案例需求

对下例所示的温度数据，筛选出每个月温度最高的两天

	1949-10-01 14:21:02	34c
	1949-10-01 19:21:02	38c
	1949-10-02 14:01:02	36c
	1950-01-01 11:21:02	32c
	1950-10-01 12:21:02	37c
	1951-12-01 12:21:02	23c
	1950-10-02 12:21:02	41c
	1950-10-03 12:21:02	27c
	1951-07-01 12:21:02	45c
	1951-07-02 12:21:02	46c
	1951-07-03 12:21:03	47c

分析设计

在map中切割每行数据，舍弃时间，将年月日和温度封装到自定义对象中作为key，value为NullWritable（空值占位符）
自定义分组器将年和月相同的数据分到一组
在reduce中，遍历key（key须已按年、月升序、温度降序排好序），输出前两条日期不重复的数据，即温度最高的两天

代码

自定义对象

/**
 * Composite map-output key for the temperature job: a calendar date plus
 * the temperature measured on it.
 *
 * <p>Keys order by year and month ascending, then temperature DESCENDING,
 * so within one month the hottest records come first.
 */
public class MyTQ implements WritableComparable<MyTQ>{
	
	private int year;
	private int month;
	private int day;
	private double temp;
	
	public int getYear() {
		return year;
	}

	public void setYear(int year) {
		this.year = year;
	}

	public int getMonth() {
		return month;
	}

	public void setMonth(int month) {
		this.month = month;
	}

	public int getDay() {
		return day;
	}

	public void setDay(int day) {
		this.day = day;
	}

	public double getTemp() {
		return temp;
	}

	public void setTemp(double temp) {
		this.temp = temp;
	}
	
	/** Text form used when the key is written to the output file. */
	@Override
	public String toString() {
		return year + "-" + month + "-" + day + ":" + temp;
	}

	/** Serializes the fields; the order must match {@link #readFields}. */
	@Override
	public void write(DataOutput out) throws IOException {
		 out.writeInt(year);
		 out.writeInt(month);
		 out.writeInt(day);
		 out.writeDouble(temp);
	}

	/** Deserializes the fields in the order written by {@link #write}. */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.year = in.readInt();
		this.month = in.readInt();
		this.day = in.readInt();
		this.temp = in.readDouble();
	}
	
	/**
	 * Sort order: year ascending, month ascending, temperature descending,
	 * then day ascending as a tie-breaker.
	 *
	 * <p>Temperature must be descending: the reducer emits the FIRST records
	 * of each month, and those have to be the hottest ones.
	 */
	@Override
	public int compareTo(MyTQ o) {
		int yc = Integer.compare(this.year, o.getYear());
		if (yc != 0) {
			return yc;
		}
		int mc = Integer.compare(this.month, o.getMonth());
		if (mc != 0) {
			return mc;
		}
		// Arguments are swapped on purpose: compare(o, this) yields
		// descending order, compare(this, o) would yield ascending order.
		int tc = Double.compare(o.getTemp(), this.temp);
		if (tc != 0) {
			return tc;
		}
		// Tie-breaker keeps the ordering consistent with equals().
		return Integer.compare(this.day, o.getDay());
	}

	/** Two keys are equal when date and temperature are all equal. */
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (!(obj instanceof MyTQ)) {
			return false;
		}
		MyTQ other = (MyTQ) obj;
		return year == other.year
				&& month == other.month
				&& day == other.day
				&& Double.compare(temp, other.temp) == 0;
	}

	/**
	 * Hash of year and month only. Hash-based partitioning uses the key's
	 * hashCode, so every record of one month must hash identically to reach
	 * the same reduce task; the inherited identity hash would not guarantee
	 * that once more than one reduce task is configured.
	 */
	@Override
	public int hashCode() {
		return 31 * year + month;
	}
	
}

主类

/**
 * Driver for the temperature job, which reports the two hottest days of
 * every month found in the input.
 */
public class MyTQMR {
	/**
	 * Configures the temperature job, submits it, and waits for it to finish.
	 * The JVM exits with status 0 on success, 1 on job failure, and 2 when
	 * the arguments are missing.
	 *
	 * @param args {@code args[0]} = input path, {@code args[1]} = output path
	 * @throws Exception if the job cannot be configured, submitted, or monitored
	 */
	public static void main(String[] args) throws Exception {
		// Empty path strings are not valid paths, so both locations must be
		// supplied by the caller.
		if (args.length < 2) {
			System.err.println("Usage: MyTQMR <input path> <output path>");
			System.exit(2);
		}

		Configuration conf = new Configuration(true);
		Job job = Job.getInstance(conf);
		job.setJarByClass(MyTQMR.class);

		// Input path.
		FileInputFormat.setInputPaths(job, args[0]);

		// Output path. WARNING: an existing directory at this location is
		// deleted recursively so the job can be re-run.
		Path path = new Path(args[1]);
		FileSystem fileSystem = path.getFileSystem(conf);
		if (fileSystem.exists(path)) {
			fileSystem.delete(path, true);
		}
		FileOutputFormat.setOutputPath(job, path);

		// Mapper and the key/value types it emits.
		job.setMapperClass(MyTQMapper.class);
		job.setMapOutputKeyClass(MyTQ.class);
		job.setMapOutputValueClass(NullWritable.class);

		// Reducer.
		job.setReducerClass(MyTQReducer.class);

		// Custom grouping: keys with the same year and month form one group.
		job.setGroupingComparatorClass(MyTQGroupComparator.class);

		// Run the job and propagate its result as the process exit status.
		boolean succeeded = job.waitForCompletion(true);
		System.exit(succeeded ? 0 : 1);
	}
}

/**
 * Temperature mapper: parses lines of the form
 * {@code yyyy-MM-dd HH:mm:ss<TAB>NNc} and emits a {@link MyTQ} key holding
 * the date and temperature, with {@link NullWritable} as the value.
 */
public class MyTQMapper extends Mapper<LongWritable, Text, MyTQ, NullWritable> {
	
	// Output key, reused for every record.
	MyTQ myKey = new MyTQ();

	/**
	 * Parses one input line and emits it as a key. Blank lines are skipped.
	 *
	 * @param key     input key supplied by the input format (unused)
	 * @param value   one line of input, e.g. {@code 1949-10-01 14:21:02<TAB>34c}
	 * @param context sink for the emitted key
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// Trim so that stray leading/trailing whitespace does not shift the fields.
		String line = value.toString().trim();
		if (line.isEmpty()) {
			return;
		}

		// fields[0] = "yyyy-MM-dd HH:mm:ss", fields[1] = temperature such as "34c"
		String[] fields = StringUtils.split(line, '\t');
		// Drop the trailing unit letter 'c' before parsing the number.
		double temp = Double.parseDouble(fields[1].split("c")[0]);

		// The date lives in the FIRST field; keep the date part and discard
		// the time of day (dateAndTime[1]).
		String[] dateAndTime = StringUtils.split(fields[0], ' ');
		String[] ymd = StringUtils.split(dateAndTime[0], '-');
		int year = Integer.parseInt(ymd[0]);
		int month = Integer.parseInt(ymd[1]);
		int day = Integer.parseInt(ymd[2]);
		
		myKey.setYear(year);
		myKey.setMonth(month);
		myKey.setDay(day);
		myKey.setTemp(temp);
		
		// All information travels in the key; the value is only a placeholder.
		context.write(myKey, NullWritable.get());
	}
}

Reduce

/**
 * Temperature reducer: for each group, writes the first record and the next
 * record whose day differs from the first one, then stops.
 */
public class MyTQReducer extends Reducer<MyTQ, NullWritable, MyTQ, NullWritable> {
	/**
	 * Emits at most two records per group, on two different days.
	 *
	 * @param key     current key of the group; it is read again on every
	 *                iteration because its day is compared per record
	 * @param values  placeholder values, one per record of the group
	 * @param context sink for the selected records
	 */
	@Override
	protected void reduce(MyTQ key, Iterable<NullWritable> values,
			Context context) throws IOException, InterruptedException {
		boolean firstPending = true; // true until the first record was written
		int firstDay = 0;            // day of the first record written
		
		for (NullWritable placeholder : values) {
			if (firstPending) {
				// First record of the group: always written.
				context.write(key, placeholder);
				firstDay = key.getDay();
				firstPending = false;
				continue;
			}
			// Later records: skip those on the same day as the first one;
			// write the first record of a different day and stop.
			if (key.getDay() != firstDay) {
				context.write(key, placeholder);
				break;
			}
		}
	}
}

自定义的分组器

/**
 * Grouping comparator for {@link MyTQ} keys: two keys fall into the same
 * group when their year and month are equal.
 */
public class MyTQGroupComparator extends WritableComparator {
	/**
	 * Registers {@link MyTQ} as the key class with the parent comparator;
	 * {@code true} is passed as the parent's second constructor argument.
	 */
	public MyTQGroupComparator() {
		super(MyTQ.class,true);
	}
	
	/**
	 * Compares two keys by year, then by month.
	 *
	 * @return 0 when both keys share year and month (same group),
	 *         a non-zero value otherwise
	 */
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		MyTQ left = (MyTQ) a;
		MyTQ right = (MyTQ) b;
		
		int byYear = Integer.compare(left.getYear(), right.getYear());
		if (byYear != 0) {
			// Different years: never the same group.
			return byYear;
		}
		// Same year: the month decides.
		return Integer.compare(left.getMonth(), right.getMonth());
	}
}

涉及到的类

NullWritable

和IntWritable、Text等类似是对数据类型进行可序列化封装的封装类，但NullWritable类似一个占位符，用于value值需要为空的情况，不会输出到文件中，也能避免因map中输出value为null而造成之后出现空指针错误
无法实例化（构造方法为private），通过静态方法get()获取实例化对象，详见源码

WritableComparable<T>接口

自定义对象作为key时需要实现该接口，重写序列化方法write(DataOutput out)、反序列化方法readFields(DataInput in)、排序方法int compareTo(T o)（本例中为compareTo(MyTQ o)）

WritableComparator类

自定义分组器对key分组时，需要继承WritableComparator类（下称WC），定义无参构造方法并在其中调用父类的构造方法，传入自定义对象的MyTQ.class和参数true
WC中有两个WritableComparable接口（即MyTQ的父接口）的指针key1，key2，和DataInputBuffer类（继承DataInputStream，DataInputStream实现DataInput接口）的对象buffer
调用分组器时，构造方法将反射创建MyTQ对象赋给key1，key2指针（传入的true此时生效，若是false则不会创建对象，key1、key2赋为null），并将buffer实例化
之后调用WC中的compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)方法，将byte数组b1中从偏移量s1开始、长度为l1的字节（即以字节的形式从文件中读取的一条数据）放入buffer，调用key1的readFields方法传入buffer，完成反序列化，将值赋给key1中各属性，对key2同样处理
赋值完后该compare方法会调用compare(key1,key2)，即自定义分组器中重写的方法
详见WritableComparator类源码