MapReduce--InputFormat Source Code Analysis and Common Implementations
1 How MapReduce splits input data (source code walkthrough)
waitForCompletion{
submit{
submitter.submitJobInternal(Job.this, cluster){
// Determine how many splits are needed for the input file(s)
int maps = writeSplits(job, submitJobDir){
maps = writeNewSplits(job, jobSubmitDir){
InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
List<InputSplit> splits = input.getSplits(job){
// File-based input formats all extend the abstract class FileInputFormat
public abstract class FileInputFormat<K, V> extends InputFormat<K, V>
FileInputFormat.getSplits{
List<InputSplit> splits = new ArrayList<InputSplit>();
// How the block size is obtained
List<FileStatus> files = listStatus(job){
result = singleThreadedListStatus(job, dirs, inputFilter, recursive){
RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath()){
listLocatedStatus(f, DEFAULT_FILTER){
/**
* Return the number of bytes that large input files should be optimally
* be split into to minimize i/o time.
* @deprecated use {@link #getDefaultBlockSize(Path)} instead
*/
@Deprecated
public long getDefaultBlockSize() {
// default to 32MB: large enough to minimize the impact of seeks
return getConf().getLong("fs.local.block.size", 32 * 1024 * 1024);
}
}
}
}
}
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)){
protected long getFormatMinSplitSize() {
return 1;
}
public static long getMinSplitSize(JobContext job) {
return job.getConfiguration().getLong(SPLIT_MINSIZE, 1L);
}
// Since we did not set any split-size parameter:
getFormatMinSplitSize() = 1
getMinSplitSize(job) = 1
long minSize = Math.max(1,1) = 1
}
long maxSize = getMaxSplitSize(job){
public static long getMaxSplitSize(JobContext context) {
return context.getConfiguration().getLong(SPLIT_MAXSIZE, Long.MAX_VALUE);
@Native public static final long MAX_VALUE = 0x7fffffffffffffffL;
// The parameter is not set, so
long maxSize = 0x7fffffffffffffffL;
}
}
for (FileStatus file: files) {
long blockSize = file.getBlockSize(){
// As shown above, fs.local.block.size defaults to 32 MB
long blockSize = getConf().getLong("fs.local.block.size", 32 * 1024 * 1024)
}
long splitSize = computeSplitSize(blockSize, minSize, maxSize){
Math.max(minSize, Math.min(maxSize, blockSize)) = Math.max(1, Math.min(0x7fffffffffffffffL, 32 * 1024 * 1024)){
long splitSize = 32 * 1024 * 1024
}
}
}
// Classic interview question: if a file is 129 MB and the block size is 128 MB, how many splits are produced?
// The answer is 1, because of the 1.1 SPLIT_SLOP threshold below
while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
private static final double SPLIT_SLOP = 1.1; // 10% slop
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
splits.add(makeSplit(path, length-bytesRemaining, splitSize,
blkLocations[blkIndex].getHosts(),
blkLocations[blkIndex].getCachedHosts()));
bytesRemaining -= splitSize;
}
}
}
T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);
return array.length;
}
// So the number of map tasks is roughly fileSize / splitSize, and splitSize defaults to the block size (see the sketch after this walkthrough)
// Locally the default block size is 32 MB; on a cluster the HDFS default block size is 128 MB
}
}
}
}
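To make the split arithmetic above concrete, here is a minimal, self-contained sketch (my own illustration, not Hadoop source) that applies the same computeSplitSize formula and the SPLIT_SLOP loop to the 129 MB interview question:
public class SplitCountSketch {
    // Same 10% slop as FileInputFormat.SPLIT_SLOP
    private static final double SPLIT_SLOP = 1.1;

    // Same formula as FileInputFormat.computeSplitSize(blockSize, minSize, maxSize)
    static long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }

    // Mirrors the split loop: cut full splits while the remainder is > 1.1 * splitSize,
    // then emit one last split for whatever is left over
    static int countSplits(long fileLength, long splitSize) {
        int splits = 0;
        long bytesRemaining = fileLength;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            splits++;
            bytesRemaining -= splitSize;
        }
        if (bytesRemaining != 0) {
            splits++;
        }
        return splits;
    }

    public static void main(String[] args) {
        long blockSize = 128L * 1024 * 1024; // 128 MB, the HDFS default on a cluster
        long splitSize = computeSplitSize(blockSize, 1L, Long.MAX_VALUE);
        System.out.println(countSplits(129L * 1024 * 1024, splitSize)); // 1: 129/128 is about 1.008, below 1.1
        System.out.println(countSplits(150L * 1024 * 1024, splitSize)); // 2: 150/128 is about 1.17, above 1.1
    }
}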
2 How MapReduce reads data from a split (source code walkthrough)
2.1 InputFormat is the base class for all input reading
/**
* <code>InputFormat</code> describes the input-specification for a
* Map-Reduce job.
*
* <p>The Map-Reduce framework relies on the <code>InputFormat</code> of the
* job to:<p>
* <ol>
* <li>
* Validate the input-specification of the job.
* <li>
* Split-up the input file(s) into logical {@link InputSplit}s, each of
* which is then assigned to an individual {@link Mapper}.
* </li>
* <li>
* Provide the {@link RecordReader} implementation to be used to glean
* input records from the logical <code>InputSplit</code> for processing by
* the {@link Mapper}.
* </li>
* </ol>
*
* <p>The default behavior of file-based {@link InputFormat}s, typically
* sub-classes of {@link FileInputFormat}, is to split the
* input into <i>logical</i> {@link InputSplit}s based on the total size, in
* bytes, of the input files. However, the {@link FileSystem} blocksize of
* the input files is treated as an upper bound for input splits. A lower bound
* on the split size can be set via
* <a href="{@docRoot}/../hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml#mapreduce.input.fileinputformat.split.minsize">
* mapreduce.input.fileinputformat.split.minsize</a>.</p>
*
* <p>Clearly, logical splits based on input-size is insufficient for many
* applications since record boundaries are to respected. In such cases, the
* application has to also implement a {@link RecordReader} on whom lies the
* responsibility to respect record-boundaries and present a record-oriented
* view of the logical <code>InputSplit</code> to the individual task.
*
* @see InputSplit
* @see RecordReader
* @see FileInputFormat
*/
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class InputFormat<K, V> {
/**
* Logically split the set of input files for the job.
*
* <p>Each {@link InputSplit} is then assigned to an individual {@link Mapper}
* for processing.</p>
*
* <p><i>Note</i>: The split is a <i>logical</i> split of the inputs and the
* input files are not physically split into chunks. For e.g. a split could
* be <i><input-file-path, start, offset></i> tuple. The InputFormat
* also creates the {@link RecordReader} to read the {@link InputSplit}.
*
* @param context job configuration.
* @return an array of {@link InputSplit}s for the job.
*/
public abstract
List<InputSplit> getSplits(JobContext context
) throws IOException, InterruptedException;
/**
* Create a record reader for a given split. The framework will call
* {@link RecordReader#initialize(InputSplit, TaskAttemptContext)} before
* the split is used.
* @param split the split to be read
* @param context the information about the task
* @return a new record reader
* @throws IOException
* @throws InterruptedException
*/
public abstract
RecordReader<K,V> createRecordReader(InputSplit split,
TaskAttemptContext context
) throws IOException,
InterruptedException;
- The input files are first split into logical InputSplits, and a RecordReader then reads the records from each split
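As a minimal sketch of that contract (my own illustration, not from the Hadoop source): a FileInputFormat subclass inherits getSplits and only has to supply a RecordReader, which is essentially what TextInputFormat does in the next section.
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class SimpleLineInputFormat extends FileInputFormat<LongWritable, Text> {
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
                                                               TaskAttemptContext context) {
        // Reuse the stock LineRecordReader: key = byte offset of the line, value = line contents.
        // getSplits(...) is inherited from FileInputFormat, so splitting works as described in section 1.
        return new LineRecordReader();
    }
}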
2.2 TextInputFormat
@Override
public RecordReader<LongWritable, Text>
createRecordReader(InputSplit split,
TaskAttemptContext context) {
String delimiter = context.getConfiguration().get(
"textinputformat.record.delimiter");
byte[] recordDelimiterBytes = null;
if (null != delimiter)
recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8);
return new LineRecordReader(recordDelimiterBytes){
public void initialize(InputSplit genericSplit,
TaskAttemptContext context) throws IOException {
FileSplit split = (FileSplit) genericSplit;
Configuration job = context.getConfiguration();
this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
start = split.getStart();
end = start + split.getLength();
final Path file = split.getPath();
// open the file and seek to the start of the split
final FileSystem fs = file.getFileSystem(job);
fileIn = fs.open(file);
CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
if (null!=codec) {
isCompressedInput = true;
decompressor = CodecPool.getDecompressor(codec);
if (codec instanceof SplittableCompressionCodec) {
final SplitCompressionInputStream cIn =
((SplittableCompressionCodec)codec).createInputStream(
fileIn, decompressor, start, end,
SplittableCompressionCodec.READ_MODE.BYBLOCK);
in = new CompressedSplitLineReader(cIn, job,
this.recordDelimiterBytes);
start = cIn.getAdjustedStart();
end = cIn.getAdjustedEnd();
filePosition = cIn;
} else {
in = new SplitLineReader(codec.createInputStream(fileIn,
decompressor), job, this.recordDelimiterBytes);
filePosition = fileIn;
}
} else {
fileIn.seek(start);
in = new UncompressedSplitLineReader(
fileIn, job, this.recordDelimiterBytes, split.getLength());
filePosition = fileIn;
}
// If this is not the first split, we always throw away first record
// because we always (except the last split) read one extra line in
// next() method.
if (start != 0) {
start += in.readLine(new Text(), 0, maxBytesToConsume(start));
}
this.pos = start;
}
}
}
- getSplits records, for each split, its start offset and length, plus how many splits there are in total
- createRecordReader creates a reader that reads the text data
- The reader uses the recorded offsets of each split to read the data, which shows that an InputSplit is a logical partition: unlike an HDFS block, there is no real file behind it
fileIn.seek(start);
in = new UncompressedSplitLineReader(
fileIn, job, this.recordDelimiterBytes, split.getLength());
filePosition = fileIn;
start += in.readLine(new Text(), 0, maxBytesToConsume(start));
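Since LineRecordReader honors textinputformat.record.delimiter (read in createRecordReader above), a job can switch from newline-delimited records to a custom delimiter with one configuration setting. A small sketch, where the "|||" delimiter is a made-up value purely for illustration:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class CustomDelimiterSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // "|||" is a hypothetical delimiter, not something used elsewhere in this post
        conf.set("textinputformat.record.delimiter", "|||");
        Job job = Job.getInstance(conf);
        job.setInputFormatClass(TextInputFormat.class);
        // Each map() call now receives one "|||"-terminated record in value
        // instead of one newline-terminated line.
    }
}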
3 Common InputFormat subclasses
- FileInputFormat
- TextInputFormat
- KeyValueTextInputFormat
- NLineInputFormat
- DBInputFormat
FileInputFormat and TextInputFormat were already covered with demos in earlier posts, so this time we look at KeyValueTextInputFormat, NLineInputFormat, and DBInputFormat.
3.1 KeyValueTextInputFormat
3.1.1 Requirement
Count how many times each line's first word appears in the file.
3.1.2 Add the dependency to the POM
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0-cdh5.16.2</version>
</dependency>
</dependencies>
3.1.3 Data
hadoop,spark,flink
hbase,hadoop,spark,flink
spark
hadoop
3.1.4 Code
package com.xk.bigata.hadoop.mapreduce.inputformat;
import com.xk.bigata.hadoop.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueLineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class KeyValueTextInputFormatDriver {
public static void main(String[] args) throws Exception {
String input = "mapreduce-basic/data/wc.txt";
String output = "mapreduce-basic/out";
// 1 Create the MapReduce job
Configuration conf = new Configuration();
conf.set(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR, ",");
Job job = Job.getInstance(conf);
// Delete the output path if it already exists
FileUtils.deleteFile(job.getConfiguration(), output);
// 2 Set the driver main class
job.setJarByClass(KeyValueTextInputFormatDriver.class);
// 3 Set the Mapper and Reducer classes
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
// 4 Set the Map output key and value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 5 Set the Reduce output key and value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 6 Set the InputFormat and the input/output paths
job.setInputFormatClass(KeyValueTextInputFormat.class);
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
// 7 Submit the job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
public static class MyMapper extends Mapper<Text, Text, Text, IntWritable> {
IntWritable ONE = new IntWritable(1);
/**
* @param key   the part of the line before the configured separator (here the first field)
* @param value the remainder of the line after the separator
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
context.write(key, ONE);
}
}
public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for (IntWritable value : values) {
count += value.get();
}
context.write(key, new IntWritable(count));
}
}
}
3.1.5 Result
hadoop 2
hbase 1
spark 1
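A detail worth noting (based on the KeyValueLineRecordReader source, not shown above): if the separator property is not set, the reader falls back to a tab character, and when a line contains no separator at all the whole line becomes the key with an empty value, which is why the bare "spark" and "hadoop" lines above still get counted. A minimal sketch of two ways to set the separator, assuming the constant resolves to the mapreduce.input.keyvaluelinerecordreader.key.value.separator property in this Hadoop version:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.input.KeyValueLineRecordReader;

public class SeparatorConfigSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Via the constant, as in the driver above
        conf.set(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR, ",");
        // Or via the raw property name (assumed equivalent)
        conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ",");
        System.out.println(conf.get(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR)); // ,
    }
}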
3.2 NLineInputFormat
3.2.1 Add the dependency to the POM
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0-cdh5.16.2</version>
</dependency>
</dependencies>
3.2.2 Source code and how map splits are created
/**
* NLineInputFormat which splits N lines of input as one split.
*
* In many "pleasantly" parallel applications, each process/mapper
* processes the same input file (s), but with computations are
* controlled by different parameters.(Referred to as "parameter sweeps").
* One way to achieve this, is to specify a set of parameters
* (one set per line) as input in a control file
* (which is the input path to the map-reduce application,
* where as the input dataset is specified
* via a config variable in JobConf.).
*
* The NLineInputFormat can be used in such applications, that splits
* the input file such that by default, one line is fed as
* a value to one map task, and key is the offset.
* i.e. (k,v) is (LongWritable, Text).
* The location hints will span the whole mapred cluster.
*/
public List<InputSplit> getSplits(JobContext job)
throws IOException {
List<InputSplit> splits = new ArrayList<InputSplit>();
int numLinesPerSplit = getNumLinesPerSplit(job);
for (FileStatus status : listStatus(job)) {
splits.addAll(getSplitsForFile(status,
job.getConfiguration(), numLinesPerSplit));
}
return splits;
}
- NLineInputFormat uses getNumLinesPerSplit(job) to determine how many lines make up one input split
- With that number of lines per split, it then reads the file through the HDFS API:
FSDataInputStream in = fs.open(fileName);
lr = new LineReader(in, conf);
Text line = new Text();
int numLines = 0;
long begin = 0;
long length = 0;
int num = -1;
while ((num = lr.readLine(line)) > 0) {
numLines++;
length += num;
if (numLines == numLinesPerSplit) {
splits.add(createFileSplit(fileName, begin, length));
begin += length;
length = 0;
numLines = 0;
}
}
if (numLines != 0) {
splits.add(createFileSplit(fileName, begin, length));
}
3.2.3 Requirement
Split the input data so that every three lines form one split.
3.2.4 Data
hadoop,spark,flink
hbase,hadoop,spark,flink
spark
hadoop
hadoop,spark,flink
hbase,hadoop,spark,flink
spark
hadoop
hbase,hadoop,spark,flink
3.2.5 Code
package com.xk.bigata.hadoop.mapreduce.inputformat;
import com.xk.bigata.hadoop.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class NLineInputFormatDriver {
public static void main(String[] args) throws Exception {
String input = "mapreduce-basic/data/wc.txt";
String output = "mapreduce-basic/out";
// 1 Create the MapReduce job
Configuration conf = new Configuration();
// conf.set(NLineInputFormat.LINES_PER_MAP,"3");
Job job = Job.getInstance(conf);
// Delete the output path if it already exists
FileUtils.deleteFile(job.getConfiguration(), output);
// 2 Set the driver main class
job.setJarByClass(NLineInputFormatDriver.class);
// 3 Set the Mapper and Reducer classes
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
// 4 Set the Map output key and value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 5 Set the Reduce output key and value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 6 Set the InputFormat and the input/output paths
// job.setInputFormatClass(NLineInputFormat.class);
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
// 7 Submit the job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
IntWritable ONE = new IntWritable(1);
/**
* @param key     the byte offset of the line within the file
* @param value   the contents of the current line
* @param context the MapReduce context, essentially a buffer that the Map output is written into
*/
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] splits = value.toString().split(",");
for (String word : splits) {
context.write(new Text(word), ONE);
}
}
}
public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
/**
* After the shuffle, all values with the same key are grouped together, e.g. <hadoop, <1,1>>
*
* @param key     e.g. hadoop
* @param values  e.g. <1,1>
* @param context the MapReduce context that the Reduce output is written into
*/
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for (IntWritable value : values) {
count += value.get();
}
context.write(key, new IntWritable(count));
}
}
}
- With conf.set(NLineInputFormat.LINES_PER_MAP, "3") and job.setInputFormatClass(NLineInputFormat.class) commented out, the console log shows:
11:48:58 INFO FileInputFormat: Total input paths to process : 1
11:48:58 INFO JobSubmitter: number of splits:1
- Total input paths to process is the number of input files read
- number of splits is the number of map input splits
- With those two lines uncommented, the console log shows:
13:36:09 INFO FileInputFormat: Total input paths to process : 1
13:36:09 INFO JobSubmitter: number of splits:3
- One input file was read
- Because the input has nine lines and three lines are configured per split, there are three input splits (see the sketch below)
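For reference, the property used above, conf.set(NLineInputFormat.LINES_PER_MAP, "3"), can also be set through the helper method on NLineInputFormat; a minimal sketch:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

public class NLineConfigSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setInputFormatClass(NLineInputFormat.class);
        // Equivalent to conf.set(NLineInputFormat.LINES_PER_MAP, "3")
        NLineInputFormat.setNumLinesPerSplit(job, 3);
        // With the 9-line input above, getSplits produces ceil(9 / 3) = 3 splits
        System.out.println(NLineInputFormat.getNumLinesPerSplit(job)); // 3
    }
}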
3.2.6 Result
flink 5
hadoop 7
hbase 3
spark 7
3.3 DBInputFormat
3.3.1 Add the dependencies to the POM
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0-cdh5.16.2</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.48</version>
</dependency>
</dependencies>
3.3.2 Requirement
Read the data from the bigdata.test table in MySQL:
1,1111
2,1111
3,1111
4,1111
5,1111
6,1111
7,1111
8,1111
9,1111
10,1111
and write it to a file.
3.3.3 DBInputFormat source code analysis
/**
* A InputFormat that reads input data from an SQL table.
* <p>
* DBInputFormat emits LongWritables containing the record number as
* key and DBWritables as value.
*
* The SQL query, and input class can be using one of the two
* setInput methods.
*/
- To read data from a database the input must be configured, i.e. the setInput method must be called
- Look at the parameters of setInput:
/**
* Initializes the map-part of the job with the appropriate input settings.
*
* @param job The map-reduce job
* @param inputClass the class object implementing DBWritable, which is the
* Java object holding tuple fields.
* @param tableName The table to read data from
* @param conditions The condition which to select data with,
* eg. '(updated > 20070101 AND length > 0)'
* @param orderBy the fieldNames in the orderBy clause.
* @param fieldNames The field names in the table
* @see #setInput(Job, Class, String, String)
*/
- Since inputClass is "the class object implementing DBWritable, which is the Java object holding tuple fields", we need a class that implements DBWritable
- Look at the DBWritable interface:
/**
* Objects that are read from/written to a database should implement
* <code>DBWritable</code>. DBWritable, is similar to {@link Writable}
* except that the {@link #write(PreparedStatement)} method takes a
* {@link PreparedStatement}, and {@link #readFields(ResultSet)}
* takes a {@link ResultSet}.
* <p>
* Implementations are responsible for writing the fields of the object
* to PreparedStatement, and reading the fields of the object from the
* ResultSet.
*
* <p>Example:</p>
* If we have the following table in the database :
* <pre>
* CREATE TABLE MyTable (
* counter INTEGER NOT NULL,
* timestamp BIGINT NOT NULL,
* );
* </pre>
* then we can read/write the tuples from/to the table with :
* <p><pre>
* public class MyWritable implements Writable, DBWritable {
* // Some data
* private int counter;
* private long timestamp;
*
* //Writable#write() implementation
* public void write(DataOutput out) throws IOException {
* out.writeInt(counter);
* out.writeLong(timestamp);
* }
*
* //Writable#readFields() implementation
* public void readFields(DataInput in) throws IOException {
* counter = in.readInt();
* timestamp = in.readLong();
* }
*
* public void write(PreparedStatement statement) throws SQLException {
* statement.setInt(1, counter);
* statement.setLong(2, timestamp);
* }
*
* public void readFields(ResultSet resultSet) throws SQLException {
* counter = resultSet.getInt(1);
* timestamp = resultSet.getLong(2);
* }
* }
* </pre></p>
*/
- The class in the official example implements both the Writable and DBWritable interfaces
3.3.4 Code
3.3.4.1 MysqlTestDomain
package com.xk.bigata.hadoop.mapreduce.domain;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
public class MysqlTestDomain implements Writable, DBWritable {
private int id;
private String name;
public MysqlTestDomain() {
}
@Override
public String toString() {
return id + "\t" + name;
}
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(id);
out.writeUTF(name);
}
@Override
public void readFields(DataInput in) throws IOException {
this.id = in.readInt();
this.name = in.readUTF();
}
@Override
public void write(PreparedStatement statement) throws SQLException {
statement.setInt(1, id);
statement.setString(2,name);
}
@Override
public void readFields(ResultSet resultSet) throws SQLException {
this.id = resultSet.getInt(1);
this.name = resultSet.getString(2);
}
}
3.3.4.2 DBInputFormatDriver
package com.xk.bigata.hadoop.mapreduce.inputformat;
import com.xk.bigata.hadoop.mapreduce.domain.MysqlTestDomain;
import com.xk.bigata.hadoop.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class DBInputFormatDriver {
public static void main(String[] args) throws Exception {
String output = "mapreduce-basic/out";
// 1 Create the MapReduce job
Configuration conf = new Configuration();
// Configure the JDBC connection
DBConfiguration.configureDB(conf,
"com.mysql.jdbc.Driver",
"jdbc:mysql://bigdatatest01:3306/bigdata",
"root",
"Jgw@31500");
Job job = Job.getInstance(conf);
// Delete the output path if it already exists
FileUtils.deleteFile(job.getConfiguration(), output);
// 2 Set the driver main class
job.setJarByClass(DBInputFormatDriver.class);
// 3 Set the Mapper class (this job has no Reducer)
job.setMapperClass(MyMapper.class);
// 4 Set the Map output key and value types
job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(MysqlTestDomain.class);
// 6 Set the InputFormat and the output path
job.setInputFormatClass(DBInputFormat.class);
// Configure the MySQL table, the DBWritable class, and the columns to read
DBInputFormat.setInput(job,
MysqlTestDomain.class,
"test",
null,
null,
"id", "name");
FileOutputFormat.setOutputPath(job, new Path(output));
// 7 Submit the job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
public static class MyMapper extends Mapper<LongWritable, MysqlTestDomain, NullWritable, MysqlTestDomain> {
@Override
protected void map(LongWritable key, MysqlTestDomain value, Context context) throws IOException, InterruptedException {
context.write(NullWritable.get(), value);
}
}
}
3.3.5 Result
10 1111
9 1111
8 1111
7 1111
6 1111
5 1111
4 1111
3 1111
2 1111
1 1111