Custom input format requirement
Read the contents of the three input files into a single SequenceFile. SequenceFile is a Hadoop-specific file format suited to key-value storage, and it stores data more compactly than plain text files. The default output format is TextOutputFormat (plain-text output); we switch it to SequenceFileOutputFormat. Each file in the input directory is read as one key-value pair: the file's entire content is wrapped as bytes for the value, and the file name serves as the key.
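To make the target format concrete, here is a minimal standalone sketch that writes one (Text, BytesWritable) record with Hadoop's plain SequenceFile API. The output path and the sample record are placeholder assumptions; the MapReduce job below produces this same kind of file automatically.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SeqFileWriteDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("demo.seq"); // placeholder output path
        SequenceFile.Writer writer = null;
        try {
            writer = SequenceFile.createWriter(conf,
                    SequenceFile.Writer.file(path),
                    SequenceFile.Writer.keyClass(Text.class),
                    SequenceFile.Writer.valueClass(BytesWritable.class));
            // one record per input file: file name as key, raw content bytes as value
            byte[] content = "hello hadoop".getBytes();
            writer.append(new Text("file1.txt"), new BytesWritable(content));
        } finally {
            IOUtils.closeStream(writer);
        }
    }
}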
First, let's look at the underlying map flow.
The crucial call in the method below is nextKeyValue(): its implementation determines what is exposed as the current key and value, so that is the method we override.
MapTask's runNewMapper() method starts the actual map phase:
1. It instantiates the user-defined Mapper from the configured class name.
2. It calls that Mapper's run() method, which drives the user-defined map():
public void run(Context context) throws IOException, InterruptedException {
    // set up variables or parameters; called exactly once per map task
    setup(context);
    try {
        // loop over the input, calling the user-defined map() for each key-value pair
        while (context.nextKeyValue()) {
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        // cleanup, including discarding any leftover key-value state; also called once
        cleanup(context);
    }
}
Writing the custom InputFormat
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

/*
 * 1. Change the split strategy: each file becomes exactly one split, by marking files as non-splittable.
 *
 * 2. Provide a RecordReader that uses the split's file name as the key and wraps the
 *    split's entire content in bytes as the value.
 */
public class MyInputFormat extends FileInputFormat<Text, BytesWritable> {

    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new MyRecordReader();
    }

    // override isSplitable so that every file is read as a single split
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }
}
Writing the custom RecordReader
This class overrides nextKeyValue() and initializes the values it needs in initialize().
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/*
 * A RecordReader reads data from the split currently being processed by the MapTask.
 *
 * Each XXXContext is a Job context; from it you can obtain the Job's Configuration object.
 */
public class MyRecordReader extends RecordReader<Text, BytesWritable> {

    private Text key;
    private BytesWritable value;
    private String filename;
    private int length;
    private FileSystem fs;
    private Path path;
    private FSDataInputStream is;
    private boolean flag = true;

    // Called automatically after MyRecordReader is created, before Mapper's run().
    // The whole file is one split, so the split length equals the file length.
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        FileSplit fileSplit = (FileSplit) split;
        filename = fileSplit.getPath().getName();
        length = (int) fileSplit.getLength();
        path = fileSplit.getPath();
        // get the current Job's Configuration object
        Configuration conf = context.getConfiguration();
        // get the file system the current Job uses
        fs = FileSystem.get(conf);
        is = fs.open(path);
    }

    // Reads one input key-value pair; returns true if one was read, false otherwise.
    // The file name is wrapped as the key, the file content as a BytesWritable value.
    // The second call to nextKeyValue() returns false.
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (flag) {
            // instantiate the objects lazily
            if (key == null) {
                key = new Text();
            }
            if (value == null) {
                value = new BytesWritable();
            }
            // wrap the file name in the key
            key.set(filename);
            // read the whole file content into the BytesWritable
            byte[] content = new byte[length];
            IOUtils.readFully(is, content, 0, length);
            value.set(content, 0, length);
            flag = false;
            return true;
        }
        return false;
    }

    // returns the key of the current key-value pair
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    // returns the value of the current key-value pair
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    // returns the progress through the split
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return flag ? 0f : 1f;
    }

    // called when the Mapper's input is closed; releases resources
    @Override
    public void close() throws IOException {
        if (is != null) {
            IOUtils.closeStream(is);
        }
        if (fs != null) {
            fs.close();
        }
    }
}
Mapper class
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// no map() override needed: the inherited map() forwards each (fileName, bytes) pair unchanged
public class CustomIFMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {
}
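Leaving the class body empty relies on the identity map() inherited from Mapper, which forwards every pair unchanged; written out explicitly, it is equivalent to this sketch:

import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// equivalent explicit form of the inherited identity map() (for illustration only)
public class CustomIFMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {

    @Override
    protected void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // forward the (fileName, bytes) pair unchanged
        context.write(key, value);
    }
}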
Reducer class
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// no reduce() override needed: the inherited reduce() writes every value out with its key
public class CustomIFReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {
}
Setting up the Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class CustomIFDriver {

    public static void main(String[] args) throws Exception {
        Path inputPath = new Path("C:\\Users\\Lenovo\\Desktop\\PPT\\input");
        Path outputPath = new Path("C:\\Users\\Lenovo\\Desktop\\PPT\\output");
        // configuration for the whole Job
        Configuration conf = new Configuration();
        // make sure the output directory does not exist
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        // ① create the Job
        Job job = Job.getInstance(conf);
        job.setJarByClass(CustomIFDriver.class);
        // give the Job a name
        job.setJobName("custom-inputformat");

        // ② configure the Job
        // set the Mapper and Reducer classes the Job runs
        job.setMapperClass(CustomIFMapper.class);
        job.setReducerClass(CustomIFReducer.class);
        // The Job prepares serializers from the declared output key-value types.
        // When Mapper and Reducer emit the same types, setting the final output types is enough.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        // set the input and output directories
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        // set the input and output formats
        job.setInputFormatClass(MyInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // ③ run the Job
        job.waitForCompletion(true);
    }
}
Execution result
The output directory now contains a single SequenceFile (part-r-00000) with one record per input file, keyed by file name.
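To inspect the SequenceFile, a small reader like the sketch below can be used (a hedged sketch: the part-r-00000 file name and output path match the driver above, but adjust as needed):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SeqFileDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path seqPath = new Path("C:\\Users\\Lenovo\\Desktop\\PPT\\output\\part-r-00000");
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(seqPath));
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            while (reader.next(key, value)) {
                // each record: (file name, file content as bytes)
                System.out.println(key + " -> " + value.getLength() + " bytes");
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}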
Custom output format
Requirement
Filter an input log file: lines containing "baidu" are written to e:/baidu.log, and all other lines to e:/other.log.
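For example, given a hypothetical input file with the two lines below, the job would route them as follows:

http://www.baidu.com → e:/baidu.log
http://www.google.com → e:/other.log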
Implementation
Mapper
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/*
 * 1. When is a Reduce phase needed?
 *    ① to merge/aggregate data
 *    ② to sort data
 *
 * 2. Without a Reduce phase, map output goes straight to the OutputFormat,
 *    so the key-value types do not need to be serializable (hence String works as a key).
 */
public class CustomOFMapper extends Mapper<LongWritable, Text, String, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String content = value.toString();
        // append a line break, since the custom RecordWriter writes raw bytes
        context.write(content + "\r\n", NullWritable.get());
    }
}
The custom OutputFormat
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// extending FileOutputFormat keeps its inherited checkOutputSpecs() and output committer,
// so only getRecordWriter() needs to be provided
public class MyOutPutFormat extends FileOutputFormat<String, NullWritable> {

    @Override
    public RecordWriter<String, NullWritable> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        return new MyRecordWriter(job);
    }
}
The custom RecordWriter
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class MyRecordWriter extends RecordWriter<String, NullWritable> {

    private Path baiduPath = new Path("e:/baidu.log");
    private Path otherPath = new Path("e:/other.log");
    private FSDataOutputStream baiduOS;
    private FSDataOutputStream otherOS;
    private FileSystem fs;

    public MyRecordWriter(TaskAttemptContext job) throws IOException {
        Configuration conf = job.getConfiguration();
        fs = FileSystem.get(conf);
        baiduOS = fs.create(baiduPath);
        otherOS = fs.create(otherPath);
    }

    // writes each key-value pair to the matching output file
    @Override
    public void write(String key, NullWritable value) throws IOException, InterruptedException {
        if (key.contains("baidu")) {
            baiduOS.write(key.getBytes());
        } else {
            otherOS.write(key.getBytes());
        }
    }

    // closes the streams
    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        if (baiduOS != null) {
            IOUtils.closeStream(baiduOS);
        }
        if (otherOS != null) {
            IOUtils.closeStream(otherOS);
        }
        if (fs != null) {
            fs.close();
        }
    }
}
Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CustomOFDriver {

    public static void main(String[] args) throws Exception {
        Path inputPath = new Path("e:/mrinput/outputformat");
        Path outputPath = new Path("e:/mroutput/outputformat");
        // configuration for the whole Job
        Configuration conf = new Configuration();
        // make sure the output directory does not exist
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        // ① create the Job
        Job job = Job.getInstance(conf);
        job.setJarByClass(CustomOFDriver.class);
        // give the Job a name
        job.setJobName("custom-outputformat");

        // ② configure the Job
        // set the Mapper class (this job has no Reducer)
        job.setMapperClass(CustomOFMapper.class);
        // set the input and output directories; even though MyRecordWriter writes to fixed
        // paths, FileOutputFormat still requires an output directory (for the _SUCCESS marker)
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        // set the output format
        job.setOutputFormatClass(MyOutPutFormat.class);
        // disable the reduce phase
        job.setNumReduceTasks(0);

        // ③ run the Job
        job.waitForCompletion(true);
    }
}