Tonight I spent some time studying DBInputFormat and DBOutputFormat. Following Cloudera's writeup at http://www.cloudera.com/blog/2009/03/06/database-access-with-hadoop/, I finally got two example programs running. I haven't fully worked out the theory yet, but at least there is some progress. The Cloudera post doesn't give complete code; the DBOutputFormat part in particular only sketches the idea, and it took me quite a few attempts to get a working program.
I. DBInputFormat
1. Creating the MySQL table (same as Cloudera's)
CREATE TABLE employees (
employee_id INTEGER NOT NULL PRIMARY KEY,
name VARCHAR(32) NOT NULL);
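The job needs some rows to read; something like the following works (the values here are made-up samples, and the test table is the one the join query in Export.java below refers to):

INSERT INTO employees (employee_id, name) VALUES (1, 'alice');
INSERT INTO employees (employee_id, name) VALUES (2, 'bob');
-- test has exactly the same structure as employees
CREATE TABLE test LIKE employees;
INSERT INTO test SELECT * FROM employees;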
2. MyRecord.java
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.lib.db.DBWritable;

public class MyRecord implements Writable, DBWritable {
    long id;
    String name;

    // Writable: deserialize from Hadoop's binary stream
    public void readFields(DataInput in) throws IOException {
        this.id = in.readLong();
        this.name = Text.readString(in);
    }

    // DBWritable: populate the fields from a JDBC result row
    public void readFields(ResultSet resultSet) throws SQLException {
        this.id = resultSet.getLong(1);
        this.name = resultSet.getString(2);
    }

    // Writable: serialize to Hadoop's binary stream
    public void write(DataOutput out) throws IOException {
        out.writeLong(this.id);
        Text.writeString(out, this.name);
    }

    // DBWritable: bind the fields to the INSERT statement's parameters
    public void write(PreparedStatement stmt) throws SQLException {
        stmt.setLong(1, this.id);
        stmt.setString(2, this.name);
    }
}
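As a quick sanity check (my own sketch, not part of the Cloudera example), the two Writable methods can be round-tripped through Hadoop's DataOutputBuffer and DataInputBuffer:

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class MyRecordTest {
    public static void main(String[] args) throws Exception {
        MyRecord r = new MyRecord();
        r.id = 1L;
        r.name = "alice";

        // Serialize via the Writable half of the interface
        DataOutputBuffer out = new DataOutputBuffer();
        r.write(out);

        // Deserialize from the same bytes
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        MyRecord copy = new MyRecord();
        copy.readFields(in);

        System.out.println(copy.id + " " + copy.name); // expect: 1 alice
    }
}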
3. MyMapper.java
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class MyMapper extends MapReduceBase implements
        Mapper<LongWritable, MyRecord, LongWritable, Text> {
    // The key is the record's position in the query result; the value is
    // a MyRecord that DBInputFormat has already deserialized for us
    public void map(LongWritable key, MyRecord val,
            OutputCollector<LongWritable, Text> output, Reporter reporter)
            throws IOException {
        output.collect(new LongWritable(val.id), new Text(val.name));
    }
}
4. Export.java
import java.util.Random;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.db.DBConfiguration;
import org.apache.hadoop.mapred.lib.db.DBInputFormat;

public class Export {
    public static void main(String[] args) {
        JobClient client = new JobClient();
        JobConf conf = new JobConf(Export.class);

        // The job may be run several times, so randomize the output
        // directory name to avoid "output directory already exists" errors
        Random ran = new Random();
        int i = ran.nextInt(1000);
        FileOutputFormat.setOutputPath(conf, new Path("Out" + i));

        conf.setInputFormat(DBInputFormat.class);
        DBConfiguration.configureDB(conf,
                "com.mysql.jdbc.Driver",
                "jdbc:mysql://127.0.0.1:3306/hadoop", "root", "123456");

        // Cloudera's original table-based call; I replaced it with the
        // query-based setInput overload below
        // String[] fields = { "employee_id", "name" };
        // DBInputFormat.setInput(conf, MyRecord.class, "employees",
        //         null /* conditions */, "employee_id", fields);

        // test is a table with exactly the same structure as employees;
        // I created it only to check whether DBInputFormat supports
        // multi-table queries. The second query, "select count(name) from
        // employees", is what Hadoop uses to estimate the amount of work.
        DBInputFormat.setInput(conf, MyRecord.class,
                "select employee_id, employees.name from employees, test where employees.name = test.name",
                "select count(name) from employees");

        conf.setMapperClass(MyMapper.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        client.setConf(conf);
        try {
            JobClient.runJob(conf);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
II. DBOutputFormat
1. Writing back into the employees table
2. MyRecord.java is the same as in Part I.
3. MyReducer.java
import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class MyReducer extends MapReduceBase implements
        Reducer<LongWritable, Text, MyRecord, Text> {
    public void reduce(LongWritable key, Iterator<Text> values,
            OutputCollector<MyRecord, Text> output,
            Reporter reporter) throws IOException {
        // Each input line has the form "number word"; parse it into a MyRecord
        String[] splits = values.next().toString().split(" ");
        MyRecord r = new MyRecord();
        r.id = Long.parseLong(splits[0]);
        r.name = splits[1];
        output.collect(r, new Text(r.name));
        // One thing puzzles me: the OutputCollector<MyRecord, MyRecord>
        // form also ran fine. What does the OutputCollector actually do here?
    }
}
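About that puzzle: as far as I can tell from reading DBOutputFormat's source, its RecordWriter serializes only the key and never touches the value, so any value type will "work". Roughly (paraphrased, not verbatim), its write method looks like this:

public void write(K key, V value) throws IOException {
    try {
        key.write(statement);   // only the key is bound to the INSERT statement
        statement.addBatch();   // the value is ignored entirely
    } catch (SQLException e) {
        e.printStackTrace();
    }
}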
4. Import.java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.db.DBConfiguration;
import org.apache.hadoop.mapred.lib.db.DBOutputFormat;

public class Import {
    public static void main(String[] args) {
        JobClient client = new JobClient();
        JobConf conf = new JobConf(Import.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(DBOutputFormat.class);

        // In2 is a directory that already exists in HDFS; each line of its
        // files has the form "number word" (it is actually the output of an
        // earlier wordcount run)
        FileInputFormat.setInputPaths(conf, new Path("In2"));

        DBConfiguration.configureDB(conf,
                "com.mysql.jdbc.Driver",
                "jdbc:mysql://127.0.0.1:3306/hadoop", "root", "123456");
        DBOutputFormat.setOutput(conf, "employees", "employee_id", "name");

        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(MyReducer.class);

        client.setConf(conf);
        try {
            JobClient.runJob(conf);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
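For reference, a couple of hypothetical lines from the In2 files, in the "number word" form MyReducer expects (made-up samples, not my actual wordcount output):

3 hadoop
4 mysql

Once the job finishes, the inserted rows can be checked from the MySQL shell:

SELECT employee_id, name FROM employees;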
It's far too late; I've been staying up night after night. Writing code on Christmas Eve, how sad. Off to bed; I'll come back tomorrow to sort out the remaining questions.