MapReduce's default InputFormat and OutputFormat are TextInputFormat and TextOutputFormat, which means it reads from text files and writes its results back out as text files. But the source data is not always text, and the output does not always have to go to a file either; often we want the results to end up in a database such as MySQL or HBase.
For HBase the official distribution already ships the necessary classes; I covered them in another post on this blog, and since only two classes are involved it is fairly simple, so I will not go into detail here. That is not to say a custom OutputFormat is hard. It is also easy: all it takes is implementing two classes, a RecordWriter and an OutputFormat. Below I first write a simple job that reads from and writes to text, and then, on top of that code, describe how to customize the output so it goes to MySQL. Once you can write to MySQL, you can of course write to just about any other kind of store, such as Redis, MongoDB and so on.
1. MapReduce with the default input and output formats
import java.io.IOException;
import java.util.TreeSet;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TopTenOrder {
public static class TokenizerMapper extends Mapper<Object, Text, NullWritable, IntWritable> {
// Keep the values seen by this map task sorted; TreeSet also de-duplicates them.
private TreeSet<Integer> top10 = new TreeSet<Integer>();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
top10.add(Integer.parseInt(value.toString()));
}
public void cleanup(Context context) throws IOException, InterruptedException {
// Once the whole split has been read, trim the set down to the 10 largest values and emit them.
while (top10.size() > 10) {
top10.remove(top10.first());
}
for (Integer number : top10) {
context.write(NullWritable.get(), new IntWritable(number));
}
}
}
public static class twopartitions extends Partitioner<NullWritable, IntWritable> implements Configurable {
@Override
public int getPartition(NullWritable key, IntWritable value, int numPartitions) {
// Even values go to partition 0, odd values to partition 1 (assumes non-negative input).
return value.get() % 2;
}
public void setConf(Configuration conf) {
// Nothing to configure for this partitioner.
}
public Configuration getConf() {
// Nothing to configure for this partitioner.
return null;
}
}
public static class IntSumReducer extends Reducer<NullWritable, IntWritable, NullWritable, IntWritable> {
// The key must be NullWritable here; with a Text key this method would not override
// Reducer.reduce() and the default identity reducer would run instead.
public void reduce(NullWritable key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
for (IntWritable val : values) {
context.write(NullWritable.get(), val);
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "TopTenOrder");
job.setJarByClass(TopTenOrder.class);
job.setMapperClass(TokenizerMapper.class);
// job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(IntWritable.class);
job.setNumReduceTasks(2);
job.setPartitionerClass(twopartitions.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
The code above reads data from a text file and, based on the partitioner's modulo, sends records to different reducers; since the modulo is either 0 or 1, there are exactly two reducers. The input file contents are as follows:
The reducer output is as follows:
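Purely as an illustration (this is hypothetical data, not the actual file or output from the run), the mapper calls Integer.parseInt on every line, so the input is expected to contain one integer per line, for example:
8
3
42
17
56
7
With the parity partitioner above, the even values (8, 42, 56) would land in reducer 0's output file and the odd values (3, 17, 7) in reducer 1's.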
Good, the basic code is done. Now suppose I want the results to go into MySQL rather than to a text file. How is that done? By defining my own OutputFormat. Before getting into the custom classes, let's quickly go over the overall MapReduce flow: the map tasks read their input splits, the map output is shuffled and sorted and sent to the different reducers, and once a reducer has finished processing its data, the OutputFormat writes out the result. The OutputFormat is clearly the last step, so a custom OutputFormat has little to do with the earlier map and reduce code; it only deals with the reducers' output.
To customize the output you need to define two classes: an OutputFormat and a RecordWriter. The RecordWriter tells the framework how the output is to be written, in other words how each record actually gets persisted, so most of the work obviously lives in the RecordWriter. Once the writing logic is defined, you hand it over to the OutputFormat and the whole thing is done.
The RecordWriter class first; you should see at a glance what it does: it defines how to write into MySQL, taking the value passed on by the reducer and inserting it into the database. Simple, isn't it?
public static class fakeRecordWrite extends RecordWriter<NullWritable, IntWritable> {
@Override
public void write(NullWritable key, IntWritable value) throws IOException, InterruptedException {
// Connection settings for the target MySQL instance.
String driver = "com.mysql.jdbc.Driver";
String url = "jdbc:mysql://10.215.4.161:3306/test";
String user = "admin";
String password = "internal";
System.out.println("start to write " + value.toString());
try {
System.out.println("it already gets into insert function");
// Note: a brand-new connection is opened for every single record; see the discussion below.
Class.forName(driver);
Connection conn = DriverManager.getConnection(url, user, password);
conn.setAutoCommit(true);
Statement statement = conn.createStatement();
String sql = "insert into test values (" + Integer.parseInt(value.toString()) + ")";
System.out.println(sql);
statement.execute(sql);
statement.close();
conn.close();
System.out.println("insert is successful " + value.toString());
} catch (Exception ex) {
ex.printStackTrace();
}
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
// Nothing to release here; every write() opens and closes its own connection.
}
}
The OutputFormat class:
public static class fakeOutPutFormat extends OutputFormat<NullWritable, IntWritable> {
@Override
public RecordWriter<NullWritable, IntWritable> getRecordWriter(TaskAttemptContext context)
throws IOException, InterruptedException {
// Hand the framework our custom writer.
return new fakeRecordWrite();
}
@Override
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
// There is no output directory to validate, so nothing to check here.
}
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
// Borrow NullOutputFormat's committer so the framework still gets a working OutputCommitter.
return (new org.apache.hadoop.mapreduce.lib.output.NullOutputFormat<NullWritable, IntWritable>())
.getOutputCommitter(context);
}
}
The RecordWriter defined earlier is plugged into the OutputFormat class, so that the OutputFormat knows the output should be written according to fakeRecordWrite; that is all there is to it. The other thing to watch is the OutputCommitter, the class that tracks the map and reduce tasks. I originally just returned null here and found that does not work: this method really has to hand back a committer that can follow the map/reduce task state from the context, so it needs a proper definition, which is why the code above borrows NullOutputFormat's committer.
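If you would rather not borrow NullOutputFormat's committer, another option that should also work is to return a FileOutputCommitter pointed at a scratch directory. This is a sketch of my own rather than something from the original code, and the path used here is hypothetical:
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
// Any writable HDFS directory will do; the committer only uses it for its temporary bookkeeping.
return new org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter(new Path("/tmp/topten_scratch"), context);
}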
The whole program now looks like this:
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.util.TreeSet;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class TopTenOrder {
public static class TokenizerMapper extends Mapper<Object, Text, NullWritable, IntWritable> {
// Keep the values seen by this map task sorted; TreeSet also de-duplicates them.
private TreeSet<Integer> top10 = new TreeSet<Integer>();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
top10.add(Integer.parseInt(value.toString()));
}
public void cleanup(Context context) throws IOException, InterruptedException {
// Once the whole split has been read, trim the set down to the 10 largest values and emit them.
while (top10.size() > 10) {
top10.remove(top10.first());
}
for (Integer number : top10) {
context.write(NullWritable.get(), new IntWritable(number));
}
}
}
public static class twopartitions extends Partitioner<NullWritable, IntWritable> implements Configurable {
@Override
public int getPartition(NullWritable key, IntWritable value, int numPartitions) {
// Even values go to partition 0, odd values to partition 1 (assumes non-negative input).
return value.get() % 2;
}
public void setConf(Configuration conf) {
// Nothing to configure for this partitioner.
}
public Configuration getConf() {
// Nothing to configure for this partitioner.
return null;
}
}
public static class fakeOutPutFormat extends OutputFormat<NullWritable, IntWritable> {
@Override
public RecordWriter<NullWritable, IntWritable> getRecordWriter(TaskAttemptContext context)
throws IOException, InterruptedException {
// Hand the framework our custom writer.
return new fakeRecordWrite();
}
@Override
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
// There is no output directory to validate, so nothing to check here.
}
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
// Borrow NullOutputFormat's committer so the framework still gets a working OutputCommitter.
return (new org.apache.hadoop.mapreduce.lib.output.NullOutputFormat<NullWritable, IntWritable>())
.getOutputCommitter(context);
}
}
public static class fakeRecordWrite extends RecordWriter<NullWritable, IntWritable> {
@Override
public void write(NullWritable key, IntWritable value) throws IOException, InterruptedException {
// Connection settings for the target MySQL instance.
String driver = "com.mysql.jdbc.Driver";
String url = "jdbc:mysql://10.215.4.161:3306/test";
String user = "admin";
String password = "internal";
System.out.println("start to write " + value.toString());
try {
System.out.println("it already gets into insert function");
// Note: a brand-new connection is opened for every single record; see the discussion below.
Class.forName(driver);
Connection conn = DriverManager.getConnection(url, user, password);
conn.setAutoCommit(true);
Statement statement = conn.createStatement();
String sql = "insert into test values (" + Integer.parseInt(value.toString()) + ")";
System.out.println(sql);
statement.execute(sql);
statement.close();
conn.close();
System.out.println("insert is successful " + value.toString());
} catch (Exception ex) {
ex.printStackTrace();
}
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
// Nothing to release here; every write() opens and closes its own connection.
}
}
public static class IntSumReducer extends Reducer<NullWritable, IntWritable, NullWritable, IntWritable> {
// The key must be NullWritable here; with a Text key this method would not override
// Reducer.reduce() and the default identity reducer would run instead.
public void reduce(NullWritable key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
for (IntWritable val : values) {
context.write(NullWritable.get(), val);
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "TopTenOrder");
job.setJarByClass(TopTenOrder.class);
job.setMapperClass(TokenizerMapper.class);
// job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(IntWritable.class);
job.setNumReduceTasks(2);
job.setPartitionerClass(twopartitions.class);
job.setOutputFormatClass(fakeOutPutFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
// FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
The output:
Below are the run logs, printed by the println statements I added by hand, showing how the program actually executes:
And that is the whole program. It still has serious shortcomings, though: as the logs show, every insert opens a new connection, so with 10 million rows there would be 10 million connections, which is clearly a big problem. This part has to change: the MySQL connection setup must be moved somewhere else, for example initialized once when the writer starts, so that the connection is established a single time and afterwards each record only executes its SQL through the statement.
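A minimal sketch of that change, assuming the same table and connection settings as above (the class name pooledRecordWrite and the single-connection approach are mine, not from the original code; it also needs java.sql.PreparedStatement and java.sql.SQLException imported): the connection and a PreparedStatement are created once in the constructor, write() only binds the value and executes, and close() releases the connection when the reduce task finishes.
public static class pooledRecordWrite extends RecordWriter<NullWritable, IntWritable> {
private Connection conn;
private PreparedStatement statement;
public pooledRecordWrite() throws IOException {
try {
// Open the connection exactly once per writer, i.e. once per reduce task.
Class.forName("com.mysql.jdbc.Driver");
conn = DriverManager.getConnection("jdbc:mysql://10.215.4.161:3306/test", "admin", "internal");
conn.setAutoCommit(true);
statement = conn.prepareStatement("insert into test values (?)");
} catch (Exception ex) {
throw new IOException(ex);
}
}
@Override
public void write(NullWritable key, IntWritable value) throws IOException, InterruptedException {
try {
// Reuse the prepared statement; only the bound value changes per record.
statement.setInt(1, value.get());
statement.execute();
} catch (SQLException ex) {
throw new IOException(ex);
}
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
try {
statement.close();
conn.close();
} catch (SQLException ex) {
throw new IOException(ex);
}
}
}
In fakeOutPutFormat, getRecordWriter would then return new pooledRecordWrite() instead of new fakeRecordWrite().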
Some of the methods in the custom classes above are left with their default, empty behaviour and I did not change them; adjust them as you need, but leaving them untouched works fine as well.
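For example, checkOutputSpecs() runs when the job is submitted, so one reasonable customization (my own sketch, not from the original code, reusing the same connection settings) is to fail fast if the target MySQL table cannot be reached:
@Override
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
try {
// Fail job submission early if the target table is not reachable.
Class.forName("com.mysql.jdbc.Driver");
Connection conn = DriverManager.getConnection("jdbc:mysql://10.215.4.161:3306/test", "admin", "internal");
Statement statement = conn.createStatement();
statement.execute("select 1 from test limit 1");
statement.close();
conn.close();
} catch (Exception ex) {
throw new IOException("cannot reach the MySQL output table", ex);
}
}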