Setting the number of ReducerTasks
Set the number of ReducerTasks and let the default HashPartitioner partition the map output, so that each partition is handed to a different reducer.
【In the source code, the configured number of ReducerTasks is passed to the Partitioner, so the reducer count determines the number of partitions.】
【Reducer output files are numbered consecutively starting from part-r-00000.】
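For reference, the default partitioner in org.apache.hadoop.mapreduce.lib.partition is essentially the following; numReduceTasks is exactly the value set via job.setNumReduceTasks():

import org.apache.hadoop.mapreduce.Partitioner;

public class HashPartitioner<K, V> extends Partitioner<K, V> {
    @Override
    public int getPartition(K key, V value, int numReduceTasks) {
        // Mask off the sign bit so the hash is non-negative,
        // then take the remainder modulo the number of reduce tasks.
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}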
1. Test data
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;

public class WriteRandom {
    public static void main(String[] args) throws IOException {
        BufferedWriter bw = new BufferedWriter(new FileWriter("D:/age/age.txt"));
        // Write 30 random integers in [1, 100], one per line, with no trailing newline.
        for (int i = 0; i < 30; i++) {
            Integer n = (int) (Math.random() * 100 + 1);
            if (i == 0) {
                bw.write(n.toString());
            } else {
                bw.newLine();
                bw.write(n.toString());
            }
        }
        bw.close();
    }
}
9
32
65
24
52
84
83
38
58
4
65
45
38
20
24
38
69
67
12
65
69
77
71
60
95
43
31
28
36
74
2. Subclassing FileInputFormat
package num;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class LineNumInputFormat extends FileInputFormat<LongWritable, Text> {
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new LineNumRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // The file must not be split: the record reader numbers lines from 1,
        // so a second split would restart the numbering.
        return false;
    }
}
- Subclass FileInputFormat as LineNumInputFormat, overriding the parent's createRecordReader() and isSplitable() methods.
- createRecordReader() returns the custom LineNumRecordReader defined below.
- isSplitable() returns false, marking the file as unsplittable; if it were split, each split's reader would restart line numbering at 1 and records would be mislabeled.
3. Overriding LineRecordReader 【extends RecordReader and implements all of its methods, modeled on Hadoop's LineRecordReader】
package num;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

public class LineNumRecordReader extends RecordReader<LongWritable, Text> {
    private long start;
    private long pos;
    private long end;
    private LineReader in;
    private FSDataInputStream fileIn;
    private LongWritable key;
    private Text value;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        FileSplit _split = (FileSplit) split;
        Path file = _split.getPath();
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        // Determine the split boundaries first, then seek to the start.
        start = _split.getStart();
        end = start + _split.getLength();
        fileIn = fs.open(file);
        fileIn.seek(start);
        in = new LineReader(fileIn);
        // Line numbering starts at 1.
        pos = 1;
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new LongWritable();
        }
        // The key is the current line number.
        key.set(pos);
        if (value == null) {
            value = new Text();
        }
        if (in.readLine(value) == 0) {
            return false;
        }
        pos++;
        return true;
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        // Progress reporting is omitted for simplicity.
        return 0;
    }

    @Override
    public void close() throws IOException {
        in.close();
    }
}
4. Mapper
package num_Reducers;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class NumMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
    IntWritable _flag_ji = new IntWritable(1); // marks odd-numbered lines
    IntWritable _flag_ou = new IntWritable(0); // marks even-numbered lines
    IntWritable _age = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value,
            Mapper<LongWritable, Text, IntWritable, IntWritable>.Context context)
            throws IOException, InterruptedException {
        _age.set(Integer.valueOf(value.toString()));
        // The key delivered by LineNumRecordReader is the line number.
        if (key.get() % 2 == 1) {
            context.write(_flag_ji, _age);
        } else {
            context.write(_flag_ou, _age);
        }
    }
}
- Using the line number delivered as the key, the mapper decides whether a line is odd or even: odd-numbered lines are emitted under the IntWritable key 1, even-numbered lines under the key 0.
5. Reducer
package num_Reducers;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class NumReducer extends Reducer<IntWritable, IntWritable, IntWritable, Text> {
    Text _avg = new Text();

    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values,
            Reducer<IntWritable, IntWritable, IntWritable, Text>.Context context)
            throws IOException, InterruptedException {
        // Sum the group's values and count them to compute the average.
        int sum = 0;
        int n = 0;
        for (IntWritable value : values) {
            sum += value.get();
            n++;
        }
        if (key.get() == 0) {
            _avg.set("even lines: " + (float) (sum / (n * 1.0)));
        } else {
            _avg.set("odd lines: " + (float) (sum / (n * 1.0)));
        }
        context.write(key, _avg);
    }
}
- Iterate over the odd-line group, summing the values and counting them, then take the average.
- Do the same for the even-line group.
- Write each average to the output.
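As a check against the sample data above: the 15 odd-numbered lines (9, 65, 52, ...) sum to 777, giving an average of 51.8, and the 15 even-numbered lines (32, 24, 84, ...) sum to 699, giving 46.6, which matches the results in section 7.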
6. Driver
package num_Reducers;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NumDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "local");
        // Delete the output directory if it already exists.
        Path outPut = new Path("file:///D:/out");
        FileSystem fs = outPut.getFileSystem(conf);
        if (fs.exists(outPut)) {
            fs.delete(outPut, true);
        }
        Job job = Job.getInstance(conf);
        job.setJobName("age");
        job.setJarByClass(NumDriver.class);
        job.setMapperClass(NumMapper.class);
        job.setReducerClass(NumReducer.class);
        // The map output types differ from the final output types, so both must be declared.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(LineNumInputFormat.class);
        // Four reduce tasks -> four partitions -> four output files.
        job.setNumReduceTasks(4);
        FileInputFormat.addInputPath(job, new Path("file:///D:/age"));
        FileOutputFormat.setOutputPath(job, outPut);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
- The Mapper takes <LongWritable, Text> and emits <IntWritable, IntWritable>; the Reducer takes <IntWritable, IntWritable> and emits <IntWritable, Text>. Because the map output types differ from the final output types, setMapOutputKeyClass/setMapOutputValueClass must be set explicitly.
- The input format is set to the custom LineNumInputFormat class.
- The number of ReducerTasks is set to 4; how many output files actually contain data depends on how many distinct keys occur (here only 0 and 1).
7. Results
part-r-00000
0	even lines: 46.6
part-r-00001
1	odd lines: 51.8
part-r-00002
part-r-00003
【The other two output files are empty.】
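The two empty files follow directly from the HashPartitioner arithmetic shown earlier: only the keys 0 and 1 ever occur, and IntWritable.hashCode() returns the wrapped int. A minimal standalone check (PartitionCheck is a hypothetical helper, not part of the job):

public class PartitionCheck {
    public static void main(String[] args) {
        int numReduceTasks = 4;
        for (int k : new int[] {0, 1}) {
            // Same arithmetic as HashPartitioner.getPartition().
            int partition = (k & Integer.MAX_VALUE) % numReduceTasks;
            System.out.printf("key %d -> part-r-%05d%n", k, partition);
        }
    }
}

This prints key 0 -> part-r-00000 and key 1 -> part-r-00001; partitions 2 and 3 receive no keys, so their files stay empty.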
Skipping the Reducer
Run without a reducer to process the data, merging each odd/even pair of lines into a single output line.
1. Sample data
123
abc
345
def
189
ghi
789
nmb
2. Expected output
123 abc
345 def
189 ghi
789 nmb
3. Custom InputFormat
package word;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class WordNumInputFormat extends FileInputFormat<LongWritable, Text> {
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        return new WordNumRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }
}
4. Custom RecordReader
package word;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

// Same line-numbering reader as LineNumRecordReader in the previous example.
public class WordNumRecordReader extends RecordReader<LongWritable, Text> {
    private long start;
    private long pos;
    private long end;
    private LineReader in;
    private FSDataInputStream fileIn;
    private LongWritable key;
    private Text value;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        FileSplit _split = (FileSplit) split;
        Path file = _split.getPath();
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        // Determine the split boundaries first, then seek to the start.
        start = _split.getStart();
        end = start + _split.getLength();
        fileIn = fs.open(file);
        fileIn.seek(start);
        in = new LineReader(fileIn);
        pos = 1;
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new LongWritable();
        }
        key.set(pos);
        if (value == null) {
            value = new Text();
        }
        if (in.readLine(value) == 0) {
            return false;
        }
        pos++;
        return true;
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return 0;
    }

    @Override
    public void close() throws IOException {
        in.close();
    }
}
This defines the same line-number record reader as LineNumRecordReader above.
5. Mapper
package word;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
    private static IntWritable _num = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, IntWritable, Text>.Context context)
            throws IOException, InterruptedException {
        if (key.get() % 2 == 0) {
            // Even line: the word. Emit it with the number buffered from the previous line.
            context.write(_num, value);
        } else {
            // Odd line: the number. Buffer it until the next line arrives.
            _num.set(Integer.parseInt(value.toString()));
        }
    }
}
Using the line number as the key, the mapper buffers each odd line (the number) and emits it together with the following even line (the word); a short trace follows.
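For the sample data: line 1 "123" is buffered into _num; line 2 "abc" is written as (123, abc); line 3 "345" is buffered; line 4 "def" is written as (345, def); and so on for the remaining pairs.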
6. Driver
package word;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordDriver {
    public static void main(String[] args) throws ClassNotFoundException, InterruptedException, IOException {
        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "local");
        Path outfile = new Path("file:///D:/out");
        FileSystem fs = outfile.getFileSystem(conf);
        if (fs.exists(outfile)) {
            fs.delete(outfile, true);
        }
        Job job = Job.getInstance(conf);
        job.setJobName("word");
        job.setJarByClass(WordDriver.class);
        job.setMapperClass(WordMapper.class);
        // The map output is the final output here; declare its types.
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(WordNumInputFormat.class);
        // Zero reduce tasks: map output is written directly (as part-m-* files).
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, new Path("file:///D:/word"));
        FileOutputFormat.setOutputPath(job, outfile);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Without a Reducer (advanced)
1. InputFormat
package word_plus;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class WordInputFormat extends FileInputFormat<Text, Text> {
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new WordRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }
}
The custom InputFormat declares its key/value types as <Text, Text>.
2. RecordReader
package word_plus;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

public class WordRecordReader extends RecordReader<Text, Text> {
    private long start;
    private long pos;
    private long end;
    private LineReader in;
    private FSDataInputStream fileIn;
    private Text key;
    private Text value;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        FileSplit _split = (FileSplit) split;
        Path file = _split.getPath();
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        // Determine the split boundaries first, then seek to the start.
        start = _split.getStart();
        end = start + _split.getLength();
        fileIn = fs.open(file);
        fileIn.seek(start);
        in = new LineReader(fileIn);
        pos = 1;
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new Text();
        }
        if (value == null) {
            value = new Text();
        }
        // Consume two consecutive lines per record: the first becomes the key,
        // the second becomes the value. Stop at end of file.
        if (in.readLine(key) == 0) {
            return false;
        }
        if (in.readLine(value) == 0) {
            return false;
        }
        return true;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return 0;
    }

    @Override
    public void close() throws IOException {
        in.close();
    }
}
- The custom RecordReader produces <Text, Text> pairs.
- in.readLine(key) and in.readLine(value) call readLine() twice, so each call to nextKeyValue() consumes two consecutive lines, assigning the first to the key and the second to the value. A file with an odd number of lines would leave a final dangling line that gets no value and is dropped (see the trace below).
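For the sample data, the first nextKeyValue() reads "123" into the key and "abc" into the value, the second reads "345" and "def", and so on; the mapper then only has to write each pair straight out.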
3. Mapper
package word_plus;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordMapper extends Mapper<Text, Text, Text, Text> {
    @Override
    protected void map(Text key, Text value, Mapper<Text, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        // Identity map: each (number, word) pair from the RecordReader is the final record.
        context.write(key, value);
    }
}
The mapper simply writes the key and value straight through; with zero reduce tasks this map output is the final result.
4. Driver
package word_plus;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "local");
        Path outfile = new Path("file:///D:/out");
        FileSystem fs = outfile.getFileSystem(conf);
        if (fs.exists(outfile)) {
            fs.delete(outfile, true);
        }
        Job job = Job.getInstance(conf);
        job.setJobName("word");
        job.setJarByClass(WordDriver.class);
        job.setMapperClass(WordMapper.class);
        // The map output is the final output; both key and value are Text.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(WordInputFormat.class);
        // Zero reduce tasks: map-only job.
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, new Path("file:///D:/word"));
        FileOutputFormat.setOutputPath(job, outfile);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}