The Mapper class (from the Hadoop source, lightly tidied and commented):

```java
package org.apache.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceStability.Stable;

@Public
@Stable
public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

  public Mapper() {
  }

  // Called once at the beginning of the task.
  protected void setup(Context context) throws IOException, InterruptedException {
  }

  // Called once for each key/value pair in the input split. The default is
  // the identity function: the input pair is passed straight through
  // (the unchecked casts are present in the Hadoop source as well).
  @SuppressWarnings("unchecked")
  protected void map(KEYIN key, VALUEIN value, Context context)
      throws IOException, InterruptedException {
    context.write((KEYOUT) key, (VALUEOUT) value);
  }

  // Called once at the end of the task.
  protected void cleanup(Context context) throws IOException, InterruptedException {
  }

  // Drives the whole task: setup(), then map() for every input pair, then cleanup().
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      while (context.nextKeyValue()) {
        map(context.getCurrentKey(), context.getCurrentValue(), context);
      }
    } finally {
      cleanup(context);
    }
  }

  // Passed to setup/map/cleanup/run; exposes the job configuration,
  // the current input pair, and the write() method.
  public abstract class Context implements MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
    public Context() {
    }
  }
}
```
The default implementation of map(): it simply writes the incoming key and value straight back out, so they are handed on unchanged to the reduce side:

```java
@SuppressWarnings("unchecked")
protected void map(KEYIN key, VALUEIN value, Context context)
    throws IOException, InterruptedException {
  context.write((KEYOUT) key, (VALUEOUT) value);
}
```
The Reducer class (from the Hadoop source, lightly tidied and commented):

```java
package org.apache.hadoop.mapreduce;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceStability.Stable;
import org.apache.hadoop.mapreduce.ReduceContext.ValueIterator;
import org.apache.hadoop.mapreduce.task.annotation.Checkpointable;

@Checkpointable
@Public
@Stable
public class Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

  public Reducer() {
  }

  // Called once at the beginning of the task.
  protected void setup(Context context) throws IOException, InterruptedException {
  }

  // Called once per key. The default is again the identity function:
  // every value is written out with its key, unchanged.
  @SuppressWarnings("unchecked")
  protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context)
      throws IOException, InterruptedException {
    for (VALUEIN value : values) {
      context.write((KEYOUT) key, (VALUEOUT) value);
    }
  }

  // Called once at the end of the task.
  protected void cleanup(Context context) throws IOException, InterruptedException {
  }

  // Drives the task: setup(), then reduce() for every distinct key, then cleanup().
  public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
      while (context.nextKey()) {
        reduce(context.getCurrentKey(), context.getValues(), context);
        // If a backup store was used to rewind the value iterator, reset it.
        Iterator<VALUEIN> iter = context.getValues().iterator();
        if (iter instanceof ValueIterator) {
          ((ValueIterator<VALUEIN>) iter).resetBackupStore();
        }
      }
    } finally {
      cleanup(context);
    }
  }

  // Exposes the current key, its values, and the write() method.
  public abstract class Context implements ReduceContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
    public Context() {
    }
  }
}
```
The default implementation of reduce(): it loops over the values received from the map side and writes each key/value pair back out:

```java
@SuppressWarnings("unchecked")
protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context)
    throws IOException, InterruptedException {
  for (VALUEIN value : values) {
    context.write((KEYOUT) key, (VALUEOUT) value);
  }
}
```
In short: a job that uses the default mapper and reducer writes its input out again unchanged.
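This pass-through behavior can be seen with a driver that configures almost nothing, essentially the book's minimal MapReduce program; the input and output paths come from the command line:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MinimalMapReduce {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "minimal"); // job name is arbitrary
    job.setJarByClass(MinimalMapReduce.class);
    // No mapper, reducer, input format or output types are set: the identity
    // Mapper and Reducer run over TextInputFormat's (LongWritable offset,
    // Text line) records, and the default output types (LongWritable, Text)
    // already match.
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
```

The output is each input line prefixed by its byte offset in the source file, since that offset is TextInputFormat's key.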
1. Overview of MapReduce types
* The meaning of the notation???
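For reference, the chapter writes the types of the phases in this general form:

map: (K1, V1) → list(K2, V2)
combiner: (K2, list(V2)) → list(K2, V2)
reduce: (K2, list(V2)) → list(K3, V3)

A combiner, when used, must leave the intermediate types (K2, V2) unchanged, and the map output types must match the reduce input types.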
2. Input formats
2.1. Input splits and records
```java
// Add an input path for the job
FileInputFormat.addInputPath(job, new Path(args[1]));
```
The client running the job first calls InputFormat's List<InputSplit> getSplits(JobContext job) to compute the splits; the application master then uses the location and size information stored in each InputSplit to schedule map tasks across the cluster. The number of map tasks is determined by the number of input splits.
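These two responsibilities, computing splits and reading records out of a split, are exactly the two methods of the InputFormat abstract class (abridged from the Hadoop source):

```java
package org.apache.hadoop.mapreduce;

import java.io.IOException;
import java.util.List;

public abstract class InputFormat<K, V> {

  // Logically split the job's input; one map task is created per split.
  public abstract List<InputSplit> getSplits(JobContext context)
      throws IOException, InterruptedException;

  // Create the reader that turns one split into key/value records.
  public abstract RecordReader<K, V> createRecordReader(
      InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException;
}
```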
1) The FileInputFormat class
2) FileInputFormat input paths
3) FileInputFormat input splits
4) Small files and CombineFileInputFormat
With the ordinary file input formats, each small file gets its own split, so every small file needs its own map task; CombineFileInputFormat instead packs many files into each split, so each mapper has more to process (see the sketch below).
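A minimal sketch using the concrete text subclass, CombineTextInputFormat; the 128 MB cap is an arbitrary example value:

```java
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;

public class CombineSmallFilesExample {
  static void configure(Job job) {
    // Pack many small files into each split instead of one split per file.
    job.setInputFormatClass(CombineTextInputFormat.class);
    // Cap the size of each combined split (example value: 128 MB).
    CombineTextInputFormat.setMaxInputSplitSize(job, 128L * 1024 * 1024);
  }
}
```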
5) Preventing splitting
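One way, as the book describes, is to subclass the concrete InputFormat and override isSplitable() to return false (another is to set the minimum split size larger than any input file). A sketch:

```java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Each input file is processed whole, in a single split, by one mapper.
public class NonSplittableTextInputFormat extends TextInputFormat {
  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    return false;
  }
}
```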
6) File information in the mapper
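Inside a mapper, the split being processed is available from the context, and for file-based input formats it can be cast to FileSplit to get at the file path, offset and length (a sketch; the mapper types here are an example):

```java
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class FileInfoMapper extends Mapper<LongWritable, Text, Text, Text> {
  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // For file-based input formats, getInputSplit() returns a FileSplit.
    FileSplit split = (FileSplit) context.getInputSplit();
    // Emit (source file name, line) as an example use of the file info.
    context.write(new Text(split.getPath().getName()), value);
  }
}
```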
The InputSplit abstract class (from the Hadoop source); note that a split holds only a length and storage locations, a reference to the data rather than the data itself:

```java
package org.apache.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceStability.Evolving;
import org.apache.hadoop.classification.InterfaceStability.Stable;
import org.apache.hadoop.mapred.SplitLocationInfo;

@Public
@Stable
public abstract class InputSplit {

  public InputSplit() {
  }

  // Size of the split in bytes; used to sort splits so the largest run first.
  public abstract long getLength() throws IOException, InterruptedException;

  // Hostnames where the split's data is stored; used for locality-aware scheduling.
  public abstract String[] getLocations() throws IOException, InterruptedException;

  // Optional per-host detail, e.g. whether the data is cached in memory.
  @Evolving
  public SplitLocationInfo[] getLocationInfo() throws IOException {
    return null;
  }
}
```
FileSplit is the InputSplit implementation used by file-based input formats (decompiled source, with the decompiler's synthetic variables cleaned up and comments added):

```java
package org.apache.hadoop.mapreduce.lib.input;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceStability.Evolving;
import org.apache.hadoop.classification.InterfaceStability.Stable;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.SplitLocationInfo;
import org.apache.hadoop.mapreduce.InputSplit;

@Public
@Stable
public class FileSplit extends InputSplit implements Writable {
  private Path file;                      // the file this split belongs to
  private long start;                     // byte offset of the split within the file
  private long length;                    // number of bytes in the split
  private String[] hosts;                 // hosts holding the split's blocks
  private SplitLocationInfo[] hostInfos;  // per-host info (e.g. in-memory or not)

  public FileSplit() {
  }

  public FileSplit(Path file, long start, long length, String[] hosts) {
    this.file = file;
    this.start = start;
    this.length = length;
    this.hosts = hosts;
  }

  public FileSplit(Path file, long start, long length, String[] hosts, String[] inMemoryHosts) {
    this(file, start, length, hosts);
    // Mark each host according to whether it also appears in inMemoryHosts.
    this.hostInfos = new SplitLocationInfo[hosts.length];
    for (int i = 0; i < hosts.length; ++i) {
      boolean inMemory = false;
      for (String inMemoryHost : inMemoryHosts) {
        if (inMemoryHost.equals(hosts[i])) {
          inMemory = true;
          break;
        }
      }
      this.hostInfos[i] = new SplitLocationInfo(hosts[i], inMemory);
    }
  }

  public Path getPath() {
    return this.file;
  }

  public long getStart() {
    return this.start;
  }

  public long getLength() {
    return this.length;
  }

  public String toString() {
    return this.file + ":" + this.start + "+" + this.length;
  }

  // Serialization: only path, start and length are shipped to the task;
  // host locations are used for scheduling only and are not serialized.
  public void write(DataOutput out) throws IOException {
    Text.writeString(out, this.file.toString());
    out.writeLong(this.start);
    out.writeLong(this.length);
  }

  public void readFields(DataInput in) throws IOException {
    this.file = new Path(Text.readString(in));
    this.start = in.readLong();
    this.length = in.readLong();
    this.hosts = null;
  }

  public String[] getLocations() throws IOException {
    return this.hosts == null ? new String[0] : this.hosts;
  }

  @Evolving
  public SplitLocationInfo[] getLocationInfo() throws IOException {
    return this.hostInfos;
  }
}
```
7) Treating a whole file as a single record
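Condensed from the book's WholeFileInputFormat example: splitting is disabled, and a custom RecordReader reads the whole file into one BytesWritable value:

```java
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {
  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    return false; // one split per file, however large
  }

  @Override
  public RecordReader<NullWritable, BytesWritable> createRecordReader(
      InputSplit split, TaskAttemptContext context) {
    return new WholeFileRecordReader();
  }

  // Produces exactly one record: (NullWritable, file contents as bytes).
  static class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {
    private FileSplit fileSplit;
    private Configuration conf;
    private final BytesWritable value = new BytesWritable();
    private boolean processed = false;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) {
      this.fileSplit = (FileSplit) split;
      this.conf = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException {
      if (processed) {
        return false;
      }
      byte[] contents = new byte[(int) fileSplit.getLength()];
      Path file = fileSplit.getPath();
      FileSystem fs = file.getFileSystem(conf);
      FSDataInputStream in = null;
      try {
        in = fs.open(file);
        IOUtils.readFully(in, contents, 0, contents.length);
        value.set(contents, 0, contents.length);
      } finally {
        IOUtils.closeStream(in);
      }
      processed = true;
      return true;
    }

    @Override public NullWritable getCurrentKey() { return NullWritable.get(); }
    @Override public BytesWritable getCurrentValue() { return value; }
    @Override public float getProgress() { return processed ? 1.0f : 0.0f; }
    @Override public void close() { }
  }
}
```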
2.2. Text input
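The main text-oriented input formats, shown as driver configuration (a sketch; a real driver would pick exactly one of these):

```java
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class TextInputExamples {
  static void configure(Job job) {
    // Default: key = byte offset of the line (LongWritable), value = the line (Text).
    job.setInputFormatClass(TextInputFormat.class);

    // Each line is split into key and value at the first separator
    // (tab by default; configurable via
    // mapreduce.input.keyvaluelinerecordreader.key.value.separator).
    job.setInputFormatClass(KeyValueTextInputFormat.class);

    // Like TextInputFormat, but each split contains a fixed number of lines.
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.setNumLinesPerSplit(job, 1000); // example value
  }
}
```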
2.3. Binary input
![](https://i-blog.csdnimg.cn/blog_migrate/19d31aa762a8df4cd1ac13582d21a740.png)
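A sketch of the usual choices, assuming sequence-file data (again, pick one in a real driver):

```java
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;

public class BinaryInputExamples {
  static void configure(Job job) {
    // Keys and values are read with the types stored in the sequence file.
    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Or: keys and values are converted to Text via toString(), which is
    // convenient for Streaming-style jobs.
    job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
  }
}
```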
2.4. Multiple inputs (MultipleInputs: a separate InputFormat and Mapper for each input path)
![](https://i-blog.csdnimg.cn/blog_migrate/6680406013b1dceece601be44d2bf131.png)
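A sketch of MultipleInputs usage; the paths and mapper classes are placeholders. Each path gets its own InputFormat and Mapper, and both mappers must emit the same intermediate types:

```java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class MultipleInputsExample {

  // Hypothetical mappers: each would parse its own source format in map()
  // (overrides omitted), but both must emit the same (Text, Text) pairs.
  static class SourceAMapper extends Mapper<LongWritable, Text, Text, Text> { }
  static class SourceBMapper extends Mapper<Text, Text, Text, Text> { }

  static void configure(Job job) {
    MultipleInputs.addInputPath(job, new Path("/data/sourceA"), // placeholder path
        TextInputFormat.class, SourceAMapper.class);
    MultipleInputs.addInputPath(job, new Path("/data/sourceB"), // placeholder path
        KeyValueTextInputFormat.class, SourceBMapper.class);
    // Note: no FileInputFormat.addInputPath() call is needed in this case.
  }
}
```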
2.5. Database input and output (DBInputFormat and DBOutputFormat; TableInputFormat and TableOutputFormat for HBase)
![](https://i-blog.csdnimg.cn/blog_migrate/75dd47580fc09266d907a5d415561e29.png)
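A sketch of relational input via DBInputFormat; the JDBC driver class, connection string, table and column names are all placeholders, and the record class is hypothetical:

```java
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;

public class DbInputExample {

  // Hypothetical record class mapping one row of the placeholder table.
  public static class MyRecord implements Writable, DBWritable {
    private int id;
    private String name;

    public void readFields(ResultSet rs) throws SQLException {
      id = rs.getInt(1);
      name = rs.getString(2);
    }
    public void write(PreparedStatement ps) throws SQLException {
      ps.setInt(1, id);
      ps.setString(2, name);
    }
    public void readFields(DataInput in) throws IOException {
      id = in.readInt();
      name = in.readUTF();
    }
    public void write(DataOutput out) throws IOException {
      out.writeInt(id);
      out.writeUTF(name);
    }
  }

  static void configure(Job job) {
    // Placeholder JDBC driver class and connection details.
    DBConfiguration.configureDB(job.getConfiguration(),
        "com.mysql.jdbc.Driver", "jdbc:mysql://dbhost/mydb", "user", "password");
    // setInput() also sets DBInputFormat as the job's input format.
    DBInputFormat.setInput(job, MyRecord.class,
        "mytable", null /* conditions */, "id" /* orderBy */, "id", "name");
  }
}
```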
3. Output formats
![](https://i-blog.csdnimg.cn/blog_migrate/312992584c5cf474e9a17cddc0153a01.png)
3.1. Text output
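TextOutputFormat, the default, writes each record as key, separator, value, calling toString() on both; the separator is configurable (a sketch):

```java
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class TextOutputExample {
  static void configure(Job job) {
    job.setOutputFormatClass(TextOutputFormat.class); // the default
    // Change the key/value separator from the default tab to a comma.
    job.getConfiguration().set("mapreduce.output.textoutputformat.separator", ",");
  }
}
```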
3.2. Binary output
![](https://i-blog.csdnimg.cn/blog_migrate/45ea2beacce8e27cf296fb23e12e63ad.png)
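For sequence-file output the key/value types come straight from the job's output types, and compression is worth enabling (a sketch; the block/Snappy choice is an example):

```java
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class BinaryOutputExample {
  static void configure(Job job) {
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // Example compression settings: block-compressed with Snappy.
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  }
}
```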
3.3. Multiple outputs
![](https://i-blog.csdnimg.cn/blog_migrate/4c153b5d2734f4f51b3766008de45a91.png)
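A sketch of MultipleOutputs: register named outputs in the driver, then write to them by name from the reducer (the output name, types, and routing rule are placeholders):

```java
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MultipleOutputsExample {

  // Driver side: declare a named output (placeholder name "errors").
  static void configure(Job job) {
    MultipleOutputs.addNamedOutput(job, "errors",
        TextOutputFormat.class, Text.class, Text.class);
  }

  // Reducer side: write either to the normal output or to the named one.
  public static class MyReducer extends Reducer<Text, Text, Text, Text> {
    private MultipleOutputs<Text, Text> mos;

    @Override
    protected void setup(Context context) {
      mos = new MultipleOutputs<>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      for (Text value : values) {
        if (key.toString().startsWith("ERR")) { // placeholder routing rule
          mos.write("errors", key, value);      // goes to errors-r-nnnnn files
        } else {
          context.write(key, value);            // goes to part-r-nnnnn files
        }
      }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
      mos.close(); // must be closed, or output may be lost
    }
  }
}
```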
3.4. Lazy output
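FileOutputFormat subclasses create part files even when a task writes nothing; LazyOutputFormat wraps another output format so that a part file is created only when the first record is actually written (a sketch):

```java
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class LazyOutputExample {
  static void configure(Job job) {
    // Wrap the real output format; empty part files are no longer created.
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
  }
}
```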
Reference:
Hadoop: The Definitive Guide, 4th Edition, Chapter 8: MapReduce Types and Formats