1. MapReduce Overview
MapReduce is a distributed-computing framework written in Java that runs on a Hadoop cluster; it is the core framework for data-analysis applications built on Hadoop.
2. Characteristics of MapReduce
- Advantages:
  - Easy to program: implementing a few simple interfaces is enough to build a distributed program.
  - Good scalability: when computing power falls short, it can be increased simply by adding machines.
  - Statistics and computation over very large datasets, scaling to the PB level and beyond.
- Disadvantages:
  - Not suited to real-time computation: what it processes is generally pre-packaged offline data.
  - Not suited to stream computation: MapReduce was designed around static input that does not change while a job runs, so its data sources must be static as well.
3. The MapReduce Execution Flow
Mapper phase
- A user-defined Mapper must extend the framework's Mapper parent class.
- The Mapper's input data comes as KV pairs (the K and V types are customizable).
- The Mapper's business logic goes in the map() method.
- The Mapper's output data is also KV pairs (the K and V types are customizable).
- The map() method (run inside a MapTask process) is called once for every input <K,V> pair.
Reduce phase
- A user-defined Reducer must extend the framework's Reducer parent class.
- The Reducer's input types match the Mapper's output types, i.e. KV pairs.
- The Reducer's business logic goes in the reduce() method.
- The ReduceTask process calls reduce() once for every group of <k,v> pairs sharing the same k.
Driver phase
- The whole program needs a Driver to submit it; what gets submitted is a Job object describing all the necessary settings. The sketch below shows all three phases together.
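To make this contract concrete, here is a minimal word-count-style skeleton (a sketch only; class and variable names such as WordCountSketch are illustrative and not part of the case study in section 4):
package sketch;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountSketch {
    // Mapper phase: map() runs once per input <K,V>; here, once per line of text
    public static class WcMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private final Text word = new Text();
        private final LongWritable one = new LongWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            for (String w : value.toString().split(" ")) {
                word.set(w);
                context.write(word, one);
            }
        }
    }

    // Reducer phase: reduce() runs once per group of values sharing a key
    public static class WcReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable v : values) sum += v.get();
            context.write(key, new LongWritable(sum));
        }
    }

    // Driver phase: describe the job in a Job object and submit it
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(WordCountSketch.class);
        job.setMapperClass(WcMapper.class);
        job.setReducerClass(WcReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}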
4. Case Study
The example below works with a website's backend access log: using the IP as the key, we aggregate statistics over the remaining fields.
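Each log line is assumed to be space-separated in an Apache-style format; a made-up example line (illustrative only, not taken from a real log):

192.168.1.10 - - [10/Oct/2023:13:55:36 +0800] "GET /index.html HTTP/1.1" 200 1024 2048

After splitting on spaces, index 0 is the client IP, index 8 the HTTP status code, index 9 the uploaded bytes, and the last index the downloaded bytes; these are exactly the positions the Mappers below read.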
1. Project directory structure
2. Driver
- MR_InPutFormatDriver
package MR_Test.Driver;

import MR_Test.InputFormat.MyInPutFormat;
import MR_Test.Utils.LongFormat;
import MR_Test.pojo.Web_Input;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MR_InPutFormatDriver {

    /**
     * Parses the text it receives and sends the parsed
     * records to the Reducer for the next step.
     */
    public static class MR_WebMapper extends Mapper<Text, Text, Text, Web_Input> {
        Text k = new Text();
        Web_Input v = new Web_Input();

        @Override
        protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
            // MyInPutFormat delivers the whole file as one value; convert it to a String
            String lines = value.toString();
            // Split the file into individual lines on the newline character
            String[] line = lines.split("\n");
            for (String s : line) {
                // Split each line on spaces
                String[] split = s.split(" ");
                // The first element of the array is the IP we want
                String IP = split[0];
                // The ninth element is the response code; check whether it is 200
                boolean cord = "200".equals(split[8]);
                // 1 if the request succeeded, 0 otherwise
                long isSuccess = cord ? 1 : 0;
                // 1 if the request failed, 0 otherwise
                long isFiled = cord ? 0 : 1;
                // Each occurrence of the IP counts as one visit
                long viewCount = 1;
                // Bytes uploaded
                long upCount = LongFormat.tolong(split[9]);
                // Bytes downloaded
                long downCount = LongFormat.tolong(split[split.length - 1]);
                // Wrap the IP as the key
                k.set(IP);
                // Wrap the extracted values in a Web_Input object as the value
                v = new Web_Input(viewCount, isSuccess, isFiled, upCount, downCount);
                // Send the key/value pair on to the Reducer via the context object
                context.write(k, v);
            }
        }
    }

    /**
     * Receives the records emitted by the Mapper
     * and aggregates them into per-IP statistics.
     */
    public static class MR_WebReduce extends Reducer<Text, Web_Input, Text, Web_Input> {

        @Override
        protected void reduce(Text key, Iterable<Web_Input> values, Context context) throws IOException, InterruptedException {
            // Use a fresh accumulator per key; reusing one instance across keys
            // would leak counts from one IP into the next
            Web_Input web_input = new Web_Input();
            // Iterate over the values emitted by the Mapper for this key and aggregate them
            for (Web_Input value : values) {
                web_input.add(value);
            }
            // Pass the aggregated result downstream via the context object
            context.write(key, web_input);
        }
    }

    /**
     * Wires up the Mapper and Reducer classes and submits the job.
     * @param args the input path to process and the output path to write to
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        //0. Create the configuration object and add any custom settings
        Configuration conf = new Configuration();
        //1. Create the Job instance
        Job job = Job.getInstance(conf);
        //2. Set the driver class for the job
        job.setJarByClass(MR_InPutFormatDriver.class);
        //3. Set the Mapper class the job runs
        job.setMapperClass(MR_WebMapper.class);
        //4. Set the Reducer class the job runs
        job.setReducerClass(MR_WebReduce.class);
        //5. Set the Mapper output key type
        job.setMapOutputKeyClass(Text.class);
        //6. Set the Mapper output value type
        job.setMapOutputValueClass(Web_Input.class);
        //7. Set the Reducer output key type
        job.setOutputKeyClass(Text.class);
        //8. Set the Reducer output value type
        job.setOutputValueClass(Web_Input.class);
        // Use the custom input format defined in section 3
        job.setInputFormatClass(MyInPutFormat.class);
        //9. Set the input path the job reads from
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        //10. Set the output path the job writes to
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
- MR_OutPutFormatDriver
package MR_Test.Driver;

import MR_Test.OutPutFormat.MyOutputFormat;
import MR_Test.Utils.LongFormat;
import MR_Test.pojo.Web_Input;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MR_OutPutFormatDriver {

    /**
     * Parses the text it receives and sends the parsed
     * records to the Reducer for the next step.
     */
    public static class MR_WebMapper extends Mapper<LongWritable, Text, Text, Web_Input> {
        Text k = new Text();
        Web_Input v = new Web_Input();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // The default TextInputFormat delivers one line per call; convert it to a String
            String lines = value.toString();
            // Split the line on spaces
            String[] split = lines.split(" ");
            // The first element of the array is the IP we want
            String IP = split[0];
            // The ninth element is the response code; check whether it is 200
            boolean cord = "200".equals(split[8]);
            // 1 if the request succeeded, 0 otherwise
            long isSuccess = cord ? 1 : 0;
            // 1 if the request failed, 0 otherwise
            long isFiled = cord ? 0 : 1;
            // Each occurrence of the IP counts as one visit
            long viewCount = 1;
            // Bytes uploaded
            long upCount = LongFormat.tolong(split[9]);
            // Bytes downloaded
            long downCount = LongFormat.tolong(split[split.length - 1]);
            // Wrap the IP as the key
            k.set(IP);
            // Wrap the extracted values in a Web_Input object as the value
            v = new Web_Input(viewCount, isSuccess, isFiled, upCount, downCount);
            // Send the key/value pair on to the Reducer via the context object
            context.write(k, v);
        }
    }

    /**
     * Receives the records emitted by the Mapper
     * and aggregates them into per-IP statistics.
     */
    public static class MR_WebReduce extends Reducer<Text, Web_Input, Text, Web_Input> {

        @Override
        protected void reduce(Text key, Iterable<Web_Input> values, Context context) throws IOException, InterruptedException {
            // Use a fresh accumulator per key; reusing one instance across keys
            // would leak counts from one IP into the next
            Web_Input web_input = new Web_Input();
            // Iterate over the values emitted by the Mapper for this key and aggregate them
            for (Web_Input value : values) {
                web_input.add(value);
            }
            // Pass the aggregated result downstream via the context object
            context.write(key, web_input);
        }
    }

    /**
     * Wires up the Mapper and Reducer classes and submits the job.
     * @param args the input path, the output path, and the directory for the custom per-digit output files
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        //0. Create the configuration object and add any custom settings
        Configuration conf = new Configuration();
        // MyRecordWriter reads the directory for its per-digit files from this
        // key, so it must be set before submitting; here we assume a third
        // command-line argument supplies it
        conf.set("custom.output.path", args[2]);
        //1. Create the Job instance
        Job job = Job.getInstance(conf);
        //2. Set the driver class for the job
        job.setJarByClass(MR_OutPutFormatDriver.class);
        //3. Set the Mapper class the job runs
        job.setMapperClass(MR_WebMapper.class);
        //4. Set the Reducer class the job runs
        job.setReducerClass(MR_WebReduce.class);
        //5. Set the Mapper output key type
        job.setMapOutputKeyClass(Text.class);
        //6. Set the Mapper output value type
        job.setMapOutputValueClass(Web_Input.class);
        //7. Set the Reducer output key type
        job.setOutputKeyClass(Text.class);
        //8. Set the Reducer output value type
        job.setOutputValueClass(Web_Input.class);
        // Use the custom output format defined in section 4
        job.setOutputFormatClass(MyOutputFormat.class);
        // Reducer parallelism (enable together with the partitioner from section 5)
        //job.setNumReduceTasks(9);
        // Custom partitioner
        //job.setPartitionerClass(IPHeadPartitioner.class);
        //9. Set the input path the job reads from
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        //10. Set the output path the job writes to
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
- MR_WebDriver
package MR_Test.Driver;

import MR_Test.Utils.LongFormat;
import MR_Test.pojo.Web_Input;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MR_WebDriver {

    /**
     * Parses the text it receives and sends the parsed
     * records to the Reducer for the next step.
     */
    public static class MR_WebMapper extends Mapper<LongWritable, Text, Text, Web_Input> {
        Text k = new Text();
        Web_Input v = new Web_Input();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // The default TextInputFormat delivers one line per call; convert it to a String
            String lines = value.toString();
            // Split the line on spaces
            String[] split = lines.split(" ");
            // The first element of the array is the IP we want
            String IP = split[0];
            // The ninth element is the response code; check whether it is 200
            boolean cord = "200".equals(split[8]);
            // 1 if the request succeeded, 0 otherwise
            long isSuccess = cord ? 1 : 0;
            // 1 if the request failed, 0 otherwise
            long isFiled = cord ? 0 : 1;
            // Each occurrence of the IP counts as one visit
            long viewCount = 1;
            // Bytes uploaded
            long upCount = LongFormat.tolong(split[9]);
            // Bytes downloaded
            long downCount = LongFormat.tolong(split[split.length - 1]);
            // Wrap the IP as the key
            k.set(IP);
            // Wrap the extracted values in a Web_Input object as the value
            v = new Web_Input(viewCount, isSuccess, isFiled, upCount, downCount);
            // Send the key/value pair on to the Reducer via the context object
            context.write(k, v);
        }
    }

    /**
     * Receives the records emitted by the Mapper
     * and aggregates them into per-IP statistics.
     */
    public static class MR_WebReduce extends Reducer<Text, Web_Input, Text, Web_Input> {

        @Override
        protected void reduce(Text key, Iterable<Web_Input> values, Context context) throws IOException, InterruptedException {
            // Use a fresh accumulator per key; reusing one instance across keys
            // would leak counts from one IP into the next
            Web_Input web_input = new Web_Input();
            // Iterate over the values emitted by the Mapper for this key and aggregate them
            for (Web_Input value : values) {
                web_input.add(value);
            }
            // Pass the aggregated result downstream via the context object
            context.write(key, web_input);
        }
    }

    /**
     * Wires up the Mapper and Reducer classes and submits the job.
     * @param args the input path to process and the output path to write to
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        //0. Create the configuration object and add any custom settings
        Configuration conf = new Configuration();
        //1. Create the Job instance
        Job job = Job.getInstance(conf);
        //2. Set the driver class for the job
        job.setJarByClass(MR_WebDriver.class);
        //3. Set the Mapper class the job runs
        job.setMapperClass(MR_WebMapper.class);
        //4. Set the Reducer class the job runs
        job.setReducerClass(MR_WebReduce.class);
        //5. Set the Mapper output key type
        job.setMapOutputKeyClass(Text.class);
        //6. Set the Mapper output value type
        job.setMapOutputValueClass(Web_Input.class);
        //7. Set the Reducer output key type
        job.setOutputKeyClass(Text.class);
        //8. Set the Reducer output value type
        job.setOutputValueClass(Web_Input.class);
        //9. Set the input path the job reads from
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        //10. Set the output path the job writes to
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
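Assuming the classes are packaged into a jar (the name MR_Test.jar below is an assumption), any of these drivers is submitted in the usual way, for example: hadoop jar MR_Test.jar MR_Test.Driver.MR_WebDriver /input/logs /output/webstats. Note that the output directory must not already exist; FileOutputFormat refuses to overwrite an existing path and the job fails immediately.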
3. InputFormat
- MyInPutFormat
package MR_Test.InputFormat;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

/**
 * Custom input format that reads each file as a single record:
 * the file name becomes the key and the complete file content
 * becomes the value.
 */
public class MyInPutFormat extends FileInputFormat<Text, Text> {

    /**
     * @param inputSplit         holds the metadata of the file being processed
     * @param taskAttemptContext the context object
     * @return a record reader that emits one <filename, content> pair per file
     */
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        MyRecodeRead recodeRead = new MyRecodeRead();
        recodeRead.initialize(inputSplit, taskAttemptContext);
        return recodeRead;
    }

    /**
     * The reader consumes the whole file at once, so the file must not be
     * split; otherwise each split of a large file would re-read the entire file.
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }
}
- MyRecodeRead
package MR_Test.InputFormat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * The record reader does the actual IO: based on the split metadata it
 * reads the data from HDFS (or another file system) into memory, wraps
 * it as a key/value pair, and hands it to the Mapper for the next step.
 */
public class MyRecodeRead extends RecordReader<Text, Text> {
    private FileSystem fs = null;
    private FileSplit fileSplit = null;
    Text k = new Text();
    Text v = new Text();
    boolean flag = true;

    /**
     * @param inputSplit         metadata of the split being processed
     * @param taskAttemptContext context object used to pass in the configuration
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // The context object carries the job configuration
        Configuration configuration = taskAttemptContext.getConfiguration();
        // Use the configuration to create a file system object
        fs = FileSystem.get(configuration);
        // The split metadata contains the path of the file to read
        fileSplit = (FileSplit) inputSplit;
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (flag) {
            // Read the entire file
            //1. Get the file path from the split
            Path path = fileSplit.getPath();
            //2. Open an input stream on the file via the file system object
            FSDataInputStream inputStream = fs.open(path);
            //3. Get the file length from the split metadata
            long length = fileSplit.getLength();
            //4. Allocate an in-memory buffer of that size
            byte[] bytes = new byte[(int) length];
            //5. Read the whole file; readFully loops until the buffer is full
            //   (a bare read() may return before all bytes have arrived)
            IOUtils.readFully(inputStream, bytes, 0, bytes.length);
            //6. Convert the bytes to a String
            String content = new String(bytes);
            //7. Get the file name
            String name = path.getName();
            //8. Emit the file name as the key and the file content as the value
            k.set(name);
            v.set(content);
            //9. Close the stream
            inputStream.close();
            flag = false;
            return true;
        }
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return k;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return v;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        // 0 until the single record has been produced, 1 afterwards
        return flag ? 0 : 1;
    }

    @Override
    public void close() throws IOException {
        fs.close();
    }
}
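Because nextKeyValue() flips flag after its first call, each split yields exactly one <filename, file content> record. This is why the Mapper in MR_InPutFormatDriver has to split its value on newlines itself, whereas the other two drivers rely on the default TextInputFormat and receive one line (with its byte offset as a LongWritable key) per map() call.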
4. OutPutFormat
- MyOutputFormat
package MR_Test.OutPutFormat;

import MR_Test.pojo.Web_Input;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * The Reducer hands its finished results to the MR framework, which
 * writes them out through the configured OutputFormat class.
 * Requirement:
 *   write records whose IPs start with different digits to different
 *   files, named "ip开头为n的数据.txt" ("data for IPs starting with n").
 * Changing the name and location of the output files requires a custom
 * OutputFormat class that performs the output IO itself.
 */
public class MyOutputFormat extends FileOutputFormat<Text, Web_Input> {
    @Override
    public RecordWriter<Text, Web_Input> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        return new MyRecordWriter(job);
    }
}
- MyRecordWriter
package MR_Test.OutPutFormat;

import MR_Test.pojo.Web_Input;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;
import java.io.OutputStream;
import java.util.HashMap;

public class MyRecordWriter extends RecordWriter<Text, Web_Input> {
    HashMap<Character, OutputStream> outputStreamMap = new HashMap<>();

    public MyRecordWriter(TaskAttemptContext context) {
        try {
            Configuration conf = context.getConfiguration();
            // The target directory comes from the configuration, so the driver
            // must set "custom.output.path" before submitting the job
            String parentPath = conf.get("custom.output.path");
            // Get the file system object
            FileSystem fs = FileSystem.get(conf);
            // Open one output stream per leading digit 1-9
            for (int i = 1; i < 10; i++) {
                char head = (char) (i + '0');
                Path path = new Path(parentPath + "/ip开头为" + i + "的数据.txt");
                FSDataOutputStream outputStream = fs.create(path);
                outputStreamMap.put(head, outputStream);
            }
        } catch (IOException e) {
            // Fail fast: a missing stream would otherwise surface later as an NPE in write()
            throw new RuntimeException(e);
        }
    }

    /**
     * write() performs the IO that writes each reduced result to its file.
     *
     * @param key   the key produced by the reducer
     * @param value the value produced by the reducer
     */
    @Override
    public void write(Text key, Web_Input value) throws IOException, InterruptedException {
        char head = key.toString().charAt(0);
        OutputStream outputStream = outputStreamMap.get(head);
        if (outputStream == null) {
            // Only IPs starting with 1-9 are expected (see IPHeadPartitioner); skip anything else
            return;
        }
        String outputStr = key + "\t" + value.toString() + "\n";
        outputStream.write(outputStr.getBytes());
        outputStream.flush();
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        for (OutputStream value : outputStreamMap.values()) {
            value.flush();
            value.close();
        }
    }
}
5. Partitioner
- IPHeadPartitioner
package MR_Test.Partitioner;

import MR_Test.pojo.Web_Input;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Routes each record to a reduce task based on the first character of
 * the IP: an IP starting with the digit n goes to partition n - 1.
 * This assumes every IP starts with a digit from 1 to 9 and that the
 * job runs with 9 reduce tasks.
 */
public class IPHeadPartitioner extends Partitioner<Text, Web_Input> {
    @Override
    public int getPartition(Text text, Web_Input ipInfo, int numPartitions) {
        String ip = text.toString();
        char head = ip.charAt(0);
        // '1' maps to partition 0, '2' to 1, ..., '9' to 8
        return head - '0' - 1;
    }
}
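To take effect, the partitioner must be registered in the driver together with a matching number of reduce tasks; concretely, the two commented-out lines in MR_OutPutFormatDriver would be enabled like this (a sketch of just the two calls):

// One reduce task per leading digit 1-9, i.e. one per partition 0-8
job.setNumReduceTasks(9);
job.setPartitionerClass(IPHeadPartitioner.class);

Note that getPartition() assumes every key starts with a digit from 1 to 9; a key starting with any other character would yield an out-of-range partition number and fail the job.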
6. pojo
- Web_Input
package MR_Test.pojo;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Writable value object holding the per-IP counters: visit count,
 * success and failure counts, and uploaded/downloaded byte totals.
 */
public class Web_Input implements Writable {
    private long viewCount;
    private long isSuccessCount;
    private long isFiled;
    private long upCount;
    private long downCount;

    public Web_Input() {
    }

    public Web_Input(long viewCount, long isSuccessCount, long isFiled, long upCount, long downCount) {
        this.viewCount = viewCount;
        this.isSuccessCount = isSuccessCount;
        this.isFiled = isFiled;
        this.upCount = upCount;
        this.downCount = downCount;
    }

    /** Adds another record's counters into this one; used by the Reducer to aggregate. */
    public void add(Web_Input other) {
        this.viewCount += other.viewCount;
        this.isSuccessCount += other.isSuccessCount;
        this.isFiled += other.isFiled;
        this.upCount += other.upCount;
        this.downCount += other.downCount;
    }

    public long getViewCount() {
        return viewCount;
    }

    public void setViewCount(long viewCount) {
        this.viewCount = viewCount;
    }

    public long getIsSuccessCount() {
        return isSuccessCount;
    }

    public void setIsSuccessCount(long isSuccessCount) {
        this.isSuccessCount = isSuccessCount;
    }

    public long getIsFiled() {
        return isFiled;
    }

    public void setIsFiled(long isFiled) {
        this.isFiled = isFiled;
    }

    public long getUpCount() {
        return upCount;
    }

    public void setUpCount(long upCount) {
        this.upCount = upCount;
    }

    public long getDownCount() {
        return downCount;
    }

    public void setDownCount(long downCount) {
        this.downCount = downCount;
    }

    /** Serialization: readFields() must consume fields in exactly the order write() emits them. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(viewCount);
        out.writeLong(isSuccessCount);
        out.writeLong(isFiled);
        out.writeLong(upCount);
        out.writeLong(downCount);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.viewCount = in.readLong();
        this.isSuccessCount = in.readLong();
        this.isFiled = in.readLong();
        this.upCount = in.readLong();
        this.downCount = in.readLong();
    }

    @Override
    public String toString() {
        return viewCount + "\t" +
                isSuccessCount + "\t" +
                isFiled + "\t" +
                upCount + "\t" +
                downCount;
    }
}
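The only contract Writable imposes is that readFields() consume the fields in exactly the order write() emitted them. A quick local round-trip check (a hypothetical test class, not part of the project):

package MR_Test.pojo;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class WebInputRoundTrip {
    public static void main(String[] args) throws IOException {
        Web_Input original = new Web_Input(1, 1, 0, 1024, 2048);
        // Serialize with write(), then deserialize with readFields()
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));
        Web_Input copy = new Web_Input();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        // Prints the tab-separated counters: 1  1  0  1024  2048
        System.out.println(copy);
    }
}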
7. Utils
- LongFormat
package MR_Test.Utils;

/** Small helper that parses a long, falling back to 0 for non-numeric input. */
public class LongFormat {
    public static long tolong(String string) {
        long l = 0L;
        try {
            l = Long.parseLong(string);
        } catch (NumberFormatException e) {
            // deliberately swallowed: non-numeric fields (such as "-" for a
            // missing byte count in the log) are treated as 0
        }
        return l;
    }
}
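The swallowed NumberFormatException is deliberate: access logs commonly use "-" for a missing byte count, and tolong() turns any such non-numeric field into 0. A hypothetical usage check:

long ok = LongFormat.tolong("1024"); // 1024
long dash = LongFormat.tolong("-");  // 0: "-" is not numeric, so the default applies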