注意重写连接方式
本文使用的是 mongo-java-driver 2.9.2 依赖;3.0 及以后版本连接数据库的方式不同(例如改用 MongoClient),使用新版本时请相应修改连接代码,或者直接导入 2.9.2 的包。
1. 导入依赖pom.xml
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>2.9.2</version>
</dependency>
数据:
db.students.insert({name:"bingbing",age:16,sex:"F"})
db.students.insert({name:"zhiyin",sex:"M"})
db.students.insert({name:"kaige",age:16})
db.students.insert({name:"yejie",age:16,sex:"F"})
db.students.insert({name:"boduo",age:18})
db.students.insert({name:"cunshang",age:15,sex:"F"})
db.students.insert({name:"dubian",age:18,sex:"F"})
结果:
{"age":15,"counter":1}
{"age":16,"counter":3}
{"age":18,"counter":2}
思路:
MongoDBInputFormat:继承 InputFormat,并实现两个核心方法:getSplits() 和 createRecordReader()
MongoDBOutPutFormat:继承 OutputFormat,并实现三个核心方法:getRecordWriter()、checkOutputSpecs()、getOutputCommitter()
MongoDBWritable:继承 Writable 的接口,额外定义两个方法:write(DBCollection dbCollection) 和 readFields(DBObject dbObject)
自定义MongoInputFormat
package MongoDBMR;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;
import com.mongodb.DB;
import com.mongodb.DBAddress;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
/**
 * Hadoop {@link InputFormat} that reads the documents of a MongoDB collection.
 *
 * <p>Each record's key is its offset inside the collection and its value is an
 * instance of the class configured under {@code mapred.mongo.split.value.class}
 * (falling back to {@link NULLMongoDBWritable}).
 *
 * @param <V> value type populated from each MongoDB document
 * @author gpy
 */
public class MongoDBInputFormat<V extends MongoDBWritable> extends InputFormat<LongWritable, V> {

    /** Host of the MongoDB server that is read from. */
    private static final String MONGO_HOST = "192.168.159.100";

    /** Database that holds the input collection. */
    private static final String MONGO_DATABASE = "hadoop";

    /** Collection whose documents are fed to the mappers. */
    private static final String MONGO_COLLECTION = "students";

    /** Number of documents assigned to one split (the last split also takes the remainder). */
    private static final long RECORDS_PER_SPLIT = 2;

    /**
     * Split describing a half-open range {@code [start, end)} of document offsets.
     *
     * @author gpy
     */
    public static class MongoDBInputSplit extends InputSplit implements Writable {
        /** Offset of the first document of the split (inclusive). */
        private long start = 0;
        /** Offset just past the last document of the split (exclusive). */
        private long end = 0;

        /** No-arg constructor; the fields are filled in later by {@link #readFields(DataInput)}. */
        public MongoDBInputSplit() {
        }

        /**
         * @param start offset of the first document (inclusive)
         * @param end offset just past the last document (exclusive)
         */
        public MongoDBInputSplit(long start, long end) {
            super();
            this.start = start;
            this.end = end;
        }

        /** Serializes {@code start} followed by {@code end}. */
        @Override
        public void write(DataOutput out) throws IOException {
            out.writeLong(start);
            out.writeLong(end);
        }

        /** Restores {@code start} and {@code end} in the order written by {@link #write(DataOutput)}. */
        @Override
        public void readFields(DataInput in) throws IOException {
            this.start = in.readLong();
            this.end = in.readLong();
        }

        /** @return the number of documents covered by this split */
        @Override
        public long getLength() throws IOException, InterruptedException {
            return end - start;
        }

        /** @return an empty array; no locality hints are provided */
        @Override
        public String[] getLocations() throws IOException, InterruptedException {
            return new String[0];
        }
    }

    /**
     * Counts the documents of the input collection and cuts them into splits of
     * {@code RECORDS_PER_SPLIT} documents; the last split absorbs any remainder.
     *
     * @param context job context (not consulted)
     * @return the splits; empty when the collection is empty
     * @throws IOException if the MongoDB host cannot be resolved
     */
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException,
            InterruptedException {
        long count;
        DB db = Mongo.connect(new DBAddress(MONGO_HOST, MONGO_DATABASE));
        try {
            count = db.getCollection(MONGO_COLLECTION).count();
        } finally {
            // The connection is only needed for counting; release it right away.
            db.getMongo().close();
        }

        long splitCount = count / RECORDS_PER_SPLIT;
        if (splitCount == 0 && count > 0) {
            // Fewer documents than one full split: still emit a single split,
            // otherwise those documents would never be read.
            splitCount = 1;
        }

        List<InputSplit> splits = new ArrayList<>();
        for (long i = 0; i < splitCount; i++) {
            long start = i * RECORDS_PER_SPLIT;
            // The final split runs to the end of the collection to pick up the remainder.
            long end = (i + 1 == splitCount) ? count : start + RECORDS_PER_SPLIT;
            splits.add(new MongoDBInputSplit(start, end));
        }
        return splits;
    }

    /**
     * Placeholder value type whose read and write methods all do nothing; used when
     * no value class is configured.
     *
     * @author gpy
     */
    public static class NULLMongoDBWritable implements MongoDBWritable {
        @Override
        public void write(DataOutput out) throws IOException {
        }

        @Override
        public void readFields(DataInput in) throws IOException {
        }

        @Override
        public void write(DBCollection dbCollection) {
        }

        @Override
        public void readFields(DBObject dbObject) {
        }
    }

    /**
     * Record reader that iterates over the documents of one {@link MongoDBInputSplit}.
     *
     * @author gpy
     * @param <V> value type populated from each document
     */
    public static class MongoDBRecordReader<V extends MongoDBWritable> extends RecordReader<LongWritable, V> {
        /** Split being read. */
        private MongoDBInputSplit split;
        /** Cursor over the split's documents; opened lazily on the first read. */
        private DBCursor dbcursor;
        /** Connection opened by this reader, or null if the cursor was supplied from outside. */
        private Mongo mongo;
        /** Number of documents returned so far. */
        private int index;
        private LongWritable key;
        private V value;

        public MongoDBRecordReader() {
        }

        /**
         * Creates a reader and initializes it immediately.
         *
         * @param split the split to read
         * @param context task context supplying the configuration
         * @throws InterruptedException propagated from {@link #initialize}
         * @throws IOException propagated from {@link #initialize}
         */
        public MongoDBRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            super();
            initialize(split, context);
        }

        /**
         * Creates a reader from fully prepared state. The reader does not take
         * ownership of any connection behind {@code dbcursor}.
         */
        public MongoDBRecordReader(MongoDBInputSplit split, DBCursor dbcursor,
                int index, LongWritable key, V value) {
            super();
            this.split = split;
            this.dbcursor = dbcursor;
            this.index = index;
            this.key = key;
            this.value = value;
        }

        /**
         * Stores the split and creates the reusable key and value objects. The value
         * class is read from {@code mapred.mongo.split.value.class}.
         */
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            this.split = (MongoDBInputSplit) split;
            this.key = new LongWritable();
            Configuration conf = context.getConfiguration();
            Class<?> valueClass = conf.getClass("mapred.mongo.split.value.class", NULLMongoDBWritable.class);
            // The configured class is expected to be a V; this cannot be verified
            // at compile time, hence the narrowly scoped suppression.
            @SuppressWarnings("unchecked")
            V instance = (V) ReflectionUtils.newInstance(valueClass, conf);
            this.value = instance;
        }

        /**
         * Advances to the next document of the split, opening the cursor on first use.
         *
         * @return true if a key/value pair was loaded, false when the split is exhausted
         */
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (this.dbcursor == null) {
                long length = this.split.getLength();
                if (length <= 0) {
                    // MongoDB interprets limit(0) as "no limit", so an empty split
                    // must not reach the query or it would read the whole collection.
                    return false;
                }
                DB db = Mongo.connect(new DBAddress(MONGO_HOST, MONGO_DATABASE));
                this.mongo = db.getMongo();
                DBCollection dbCollection = db.getCollection(MONGO_COLLECTION);
                // NOTE: skip/limit take int, so offsets beyond Integer.MAX_VALUE are not supported.
                this.dbcursor = dbCollection.find().skip((int) this.split.start).limit((int) length);
            }
            boolean hasNext = this.dbcursor.hasNext();
            if (hasNext) {
                DBObject dbObject = this.dbcursor.next();
                // The key is the document's absolute offset within the collection.
                this.key.set(this.split.start + index);
                index++;
                this.value.readFields(dbObject);
            }
            return hasNext;
        }

        @Override
        public LongWritable getCurrentKey() throws IOException,
                InterruptedException {
            return this.key;
        }

        @Override
        public V getCurrentValue() throws IOException, InterruptedException {
            return this.value;
        }

        /**
         * @return the fraction of the split already read, in {@code [0, 1]};
         *         0 when the split is unknown or empty
         */
        @Override
        public float getProgress() throws IOException, InterruptedException {
            if (this.split == null) {
                return 0f;
            }
            long length = this.split.getLength();
            if (length <= 0) {
                return 0f;
            }
            return Math.min(1.0f, (float) index / length);
        }

        /**
         * Closes the cursor (if one was opened) and the connection this reader created.
         */
        @Override
        public void close() throws IOException {
            try {
                if (dbcursor != null) {
                    dbcursor.close();
                }
            } finally {
                if (mongo != null) {
                    mongo.close();
                    mongo = null;
                }
            }
        }
    }

    /**
     * Creates the record reader for the given split.
     */
    @Override
    public RecordReader<LongWritable, V> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException,
            InterruptedException {
        return new MongoDBRecordReader<>(split, context);
    }
}
自定义OutPutFormat
package MongoDBMR;
import java.io.IOException;
import java.net.UnknownHostException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import com.mongodb.DB;
import com.mongodb.DBAddress;
import com.mongodb.DBCollection;
import com.mongodb.Mongo;
/**
 * Hadoop {@link OutputFormat} that stores reducer output in a MongoDB collection.
 *
 * @param <V> value type that knows how to write itself to a collection
 * @author gpy
 */
public class MongoDBOutPutFormat<V extends MongoDBWritable> extends OutputFormat<NullWritable, V> {

    /**
     * Record writer that delegates each value to {@link MongoDBWritable#write(DBCollection)}.
     *
     * @author gpy
     * @param <V> value type being written
     */
    public static class MongoDBRecordWriter<V extends MongoDBWritable> extends RecordWriter<NullWritable, V> {
        /** Target collection; null until a connection has been established or assigned. */
        public DBCollection dbCollection = null;

        /** Connection opened by this writer, or null if it did not open one itself. */
        private Mongo mongo;

        /** Creates a writer without a target collection; {@link #dbCollection} must be set before writing. */
        public MongoDBRecordWriter() {
        }

        /**
         * Connects to MongoDB and selects the {@code result} collection.
         *
         * @param context task context (not consulted)
         * @throws IllegalStateException if the MongoDB host cannot be resolved
         */
        public MongoDBRecordWriter(TaskAttemptContext context) {
            try {
                DB db = Mongo.connect(new DBAddress("192.168.159.100", "hadoop"));
                this.mongo = db.getMongo();
                this.dbCollection = db.getCollection("result");
            } catch (UnknownHostException e) {
                // Fail immediately with the cause attached instead of leaving the
                // collection null and crashing later with an unrelated NPE.
                throw new IllegalStateException("Cannot resolve MongoDB host 192.168.159.100", e);
            }
        }

        /**
         * Writes one value by calling its own {@code write(DBCollection)}.
         *
         * @param key ignored
         * @param value the value to store
         * @throws IOException if no target collection is available
         */
        @Override
        public void write(NullWritable key, V value) throws IOException,
                InterruptedException {
            if (this.dbCollection == null) {
                throw new IOException("MongoDB output collection has not been initialised");
            }
            value.write(this.dbCollection);
        }

        /**
         * Closes the connection opened by this writer, if any. A collection assigned
         * from outside is left untouched.
         */
        @Override
        public void close(TaskAttemptContext context) throws IOException,
                InterruptedException {
            if (mongo != null) {
                mongo.close();
                mongo = null;
            }
        }
    }

    /**
     * Creates the record writer for a task.
     */
    @Override
    public RecordWriter<NullWritable, V> getRecordWriter(
            TaskAttemptContext context) throws IOException,
            InterruptedException {
        // The diamond operator requires JDK 1.7 or later.
        return new MongoDBRecordWriter<>(context);
    }

    /**
     * Performs no validation of the output specification.
     */
    @Override
    public void checkOutputSpecs(JobContext context) throws IOException,
            InterruptedException {
        // do nothing
    }

    /**
     * Returns a {@link FileOutputCommitter} created with a null output path, since
     * this format writes no files.
     */
    @Override
    public OutputCommitter getOutputCommitter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new FileOutputCommitter(null, context);
    }
}
自定义MongoDBWritable接口
package MongoDBMR;
import org.apache.hadoop.io.Writable;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
/**
 * A {@link Writable} that can additionally be stored in, and loaded from, MongoDB.
 *
 * @author gpy
 */
public interface MongoDBWritable extends Writable {

    /**
     * Writes this object's data to MongoDB.
     *
     * @param dbCollection the collection to write to
     */
    void write(DBCollection dbCollection);

    /**
     * Reads this object's data from a MongoDB document.
     *
     * @param dbObject the document to read from
     */
    void readFields(DBObject dbObject);
}
自定义数据类型
package MongoDBMR;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import com.mongodb.BasicDBObjectBuilder;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
/**
* 自定义数据类型
* @author gpy
*
*/
public class PersonsMongoDBWritable implements MongoDBWritable{
public String name;
public Integer age;
public String sex;
public Integer counter = 1;
/**
* 注意处理空的值
*/
public void write(DataOutput out) throws IOException {
out.writeUTF(name);
if(age == null || sex == null){
out.writeBoolean(false);
} else {
out.writeBoolean(true);
out.writeInt(age);
out.writeUTF(sex);
}
out.writeInt(counter);
}
public void readFields(DataInput in) throws IOException {
this.name = in.readUTF();
if(in.readBoolean()){
this.sex = in.readUTF();
this.age = in.readInt();
} else {
this.sex = null;
this.age = null;
}
this.counter = in.readInt();
}
/**
* 写数据
*/
public void write(DBCollection dbCollection) {
DBObject dbObject = BasicDBObjectBuilder.start().add("age", this.age).add("counter",this.counter).get();
//将dbobject插入
dbCollection.insert(dbObject);
}
/**
* 读数据
*/
public void readFields(DBObject dbObject) {
this.name = dbObject.get("name").toString();
if(dbObject.get("age") != null){
this.age = Double.valueOf(dbObject.get("age").toString()).intValue();
} else {
this.age = null;
}
}
/**
* @return the name
*/
public String getName() {
return name;
}
/**
* @param name the name to set
*/
public void setName(String name) {
this.name = name;
}
/**
* @return the age
*/
public Integer getAge() {
return age;
}
/**
* @param age the age to set
*/
public void setAge(Integer age) {
this.age = age;
}
/**
* @return the sex
*/
public String getSex() {
return sex;
}
/**
* @param sex the sex to set
*/
public void setSex(String sex) {
this.sex = sex;
}
/**
* @return the counter
*/
public Integer getCounter() {
return counter;
}
/**
* @param counter the counter to set
*/
public void setCounter(Integer counter) {
this.counter = counter;
}
}