理解KeyValueTextInputFormat
KeyValueTextInputFormat将每一行作为一条记录处理, 被分隔符(缺省是tab(\t))分割为key(Text)和value(Text)
例子:
输入是一个包含4条记录的分片。其中——>表示一个(水平方向的)制表符。
line1 ——>Rich learning form
line2 ——>Intelligent learning engine
line3 ——>Learning more convenient
line4 ——>From the real demand for more close to the enterprise
每条记录表示为以下键/值对:
(line1,Rich learning form)
(line2,Intelligent learning engine)
(line3,Learning more convenient)
(line4,From the real demand for more close to the enterprise)
编写KVTextMapper类
package com.dev1.keyvalue;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class KVTextMapper extends Mapper<Text, Text, Text, LongWritable> {

    // Reused output value holding the constant count 1; avoids allocating
    // (or re-setting) a LongWritable for every input record.
    private final LongWritable one = new LongWritable(1);

    /**
     * Emits (key, 1) for every input record. KeyValueTextInputFormat has
     * already split each input line on the configured separator, so the
     * map phase only tags each key with a count of 1.
     *
     * @param key     the text before the separator (e.g. "line1")
     * @param value   the text after the separator (not used by this job)
     * @param context Hadoop context the (key, 1) pair is written to
     * @throws IOException          if the write to the context fails
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        // NOTE: the original printed every record to stdout for debugging;
        // that leftover trace (and a commented-out local) has been removed.
        context.write(key, one);
    }
}
编写KVTextReducer类
package com.dev1.keyvalue;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class KVTextReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    // Reusable output value so a fresh LongWritable is not allocated per key.
    private LongWritable v = new LongWritable();

    /**
     * Sums all counts emitted by the mapper for a single key and writes
     * one (key, total) pair to the output.
     *
     * @param key     the record key produced by KeyValueTextInputFormat
     * @param values  all counts (each 1) collected for this key
     * @param context Hadoop context the aggregated pair is written to
     * @throws IOException          if the write to the context fails
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long total = 0;
        for (LongWritable count : values) {
            total += count.get();
        }
        v.set(total);
        context.write(key, v);
    }
}
编写KVTextDriver类
package com.dev1.keyvalue;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueLineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class KVTextDriver {

    /**
     * Configures and submits the key/value counting job.
     *
     * <p>args[0] = input path, args[1] = output path (must not exist yet).
     * Exits with 0 on job success, 1 on job failure, 2 on bad usage.
     */
    public static void main(String[] args) throws Exception {
        // Fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException on missing arguments.
        if (args.length < 2) {
            System.err.println("Usage: KVTextDriver <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        // Separator between key and value on each input line.
        // (KEY_VALUE_SEPERATOR is Hadoop's own — intentionally misspelled — constant.)
        conf.set(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR, "\t");

        // 1. Obtain the job object.
        Job job = Job.getInstance(conf);

        // 2. Jar location plus mapper/reducer wiring.
        job.setJarByClass(KVTextDriver.class);
        job.setMapperClass(KVTextMapper.class);
        job.setReducerClass(KVTextReducer.class);

        // 3. Map output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // 4. Final output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 5. Input path and the input format that splits each line into key/value.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        // 6. Output path.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7. Submit and propagate success/failure through the exit code
        // (the original discarded waitForCompletion's boolean result,
        // so the driver exited 0 even when the job failed).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
运行