MR整合MongoDB

注意重写连接方式
本文使用的是 mongo-java-driver 2.9.2 的依赖;3.0 以后的驱动连接数据库的方式不一样了,使用新版本时注意修改连接代码,或者直接导入 2.9.2 的包也行

1. 导入依赖pom.xml
 <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>mongo-java-driver</artifactId>
            <version>2.9.2</version>
        </dependency>

数据:

db.students.insert({name:"bingbing",age:16,sex:"F"})
db.students.insert({name:"zhiyin",sex:"M"})
db.students.insert({name:"kaige",age:16})
db.students.insert({name:"yejie",age:16,sex:"F"})
db.students.insert({name:"boduo",age:18})
db.students.insert({name:"cunshang",age:15,sex:"F"})
db.students.insert({name:"dubian",age:18,sex:"F"})

结果:

{"age":15,"counter":1}
{"age":16,"counter":3}
{"age":18,"counter":2}

思路:

MongoDBInputFormat: 继承InputFormat,并实现两个核心方法:getSplits() 和 createRecordReader()
MongoDBOutPutFormat:继承OutputFormat,并实现三个核心方法:getRecordWriter()、checkOutputSpecs()、getOutputCommitter()
MongoDBWritable:继承Writable,并额外声明两个方法:write(DBCollection dbCollection) 和 readFields(DBObject dbObject)

自定义MongoDBInputFormat

package MongoDBMR;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;

import com.mongodb.DB;
import com.mongodb.DBAddress;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.Mongo;

/**
 * Hadoop input format that feeds the documents of a MongoDB collection into a
 * MapReduce job.
 *
 * <p>The collection is cut into contiguous ranges of document positions; each
 * range becomes one {@link MongoDBInputSplit} and is read back with a
 * {@code skip}/{@code limit} query by {@link MongoDBRecordReader}.
 *
 * @param <V> value type that is populated from each MongoDB document
 * @author gpy
 */
public class MongoDBInputFormat<V extends MongoDBWritable> extends InputFormat<LongWritable, V>{

	/** Host of the MongoDB server the data is read from. */
	private static final String MONGO_HOST = "192.168.159.100";

	/** Database that holds the input collection. */
	private static final String MONGO_DATABASE = "hadoop";

	/** Collection whose documents are used as job input. */
	private static final String MONGO_COLLECTION = "students";

	/** Maximum number of documents placed in a single split. */
	private static final long DOCUMENTS_PER_SPLIT = 2;

	/** Configuration key naming the {@link MongoDBWritable} class used for values. */
	private static final String VALUE_CLASS_KEY = "mapred.mongo.split.value.class";

	/**
	 * Split describing a half-open range {@code [start, end)} of document
	 * positions inside the input collection.
	 *
	 * @author gpy
	 */
	public static class MongoDBInputSplit extends InputSplit implements Writable{

		/** Position of the first document of the split (inclusive). */
		private long start = 0;

		/** Position just past the last document of the split (exclusive). */
		private long end = 0;

		/** No-argument constructor; the fields are then filled in by {@link #readFields(DataInput)}. */
		public MongoDBInputSplit(){
		}

		/**
		 * @param start position of the first document (inclusive)
		 * @param end position just past the last document (exclusive)
		 */
		public MongoDBInputSplit(long start, long end) {
			super();
			this.start = start;
			this.end = end;
		}

		/** Serializes the range as two longs: start, then end. */
		@Override
		public void write(DataOutput out) throws IOException {
			out.writeLong(start);
			out.writeLong(end);
		}

		/** Restores the range in the order {@link #write(DataOutput)} emitted it. */
		@Override
		public void readFields(DataInput in) throws IOException {
			this.start = in.readLong();
			this.end = in.readLong();
		}

		/** @return the number of documents covered by this split */
		@Override
		public long getLength() throws IOException, InterruptedException {
			return end - start;
		}

		/** @return an empty array; no host locations are reported for a split */
		@Override
		public String[] getLocations() throws IOException, InterruptedException {
			return new String[0];
		}
	}

	/**
	 * Counts the documents of the input collection and cuts that count into
	 * ranges of at most {@link #DOCUMENTS_PER_SPLIT} documents.
	 *
	 * <p>The last range is shortened to the remaining documents, so every
	 * document belongs to exactly one split, including when the collection
	 * holds fewer documents than one full split. An empty collection yields an
	 * empty list.
	 *
	 * @param context job context (not consulted)
	 * @return the splits covering the whole collection
	 * @throws IOException if the MongoDB host cannot be resolved
	 */
	@Override
	public List<InputSplit> getSplits(JobContext context) throws IOException,
			InterruptedException {
		DB db = Mongo.connect(new DBAddress(MONGO_HOST, MONGO_DATABASE));
		long count;
		try {
			count = db.getCollection(MONGO_COLLECTION).count();
		} finally {
			// The connection is only needed for the count; release it right away.
			db.getMongo().close();
		}

		List<InputSplit> splits = new ArrayList<InputSplit>();
		for (long start = 0; start < count; start += DOCUMENTS_PER_SPLIT) {
			long end = Math.min(start + DOCUMENTS_PER_SPLIT, count);
			splits.add(new MongoDBInputSplit(start, end));
		}
		return splits;
	}

	/**
	 * Value type whose read and write methods all do nothing; used as the
	 * fallback when no value class is configured.
	 *
	 * @author gpy
	 */
	public static class NULLMongoDBWritable implements MongoDBWritable{

		@Override
		public void write(DataOutput out) throws IOException {
		}

		@Override
		public void readFields(DataInput in) throws IOException {
		}

		@Override
		public void write(DBCollection dbCollection) {
		}

		@Override
		public void readFields(DBObject dbObject) {
		}
	}

	/**
	 * Record reader that iterates over the documents of one
	 * {@link MongoDBInputSplit}.
	 *
	 * <p>Keys are the position of the document (split start plus the number of
	 * documents already read); values are a single reused instance of the
	 * configured value class, refilled for every document.
	 *
	 * @author gpy
	 *
	 * @param <V> value type populated from each document
	 */
	public static class MongoDBRecordReader<V extends MongoDBWritable> extends RecordReader<LongWritable, V>{
		/** Range of documents this reader is responsible for. */
		private MongoDBInputSplit split;
		/** Cursor over the documents of the split; opened lazily on the first read. */
		private DBCursor dbcursor;
		/** Connection owned by this reader; null when the cursor was supplied by the caller. */
		private Mongo mongo;
		/** Number of documents returned so far. */
		private int index;
		private LongWritable key;
		private V value;

		public MongoDBRecordReader(){
		}

		/**
		 * Creates a reader and immediately initializes it for the given split.
		 *
		 * @param split split to read; must be a {@link MongoDBInputSplit}
		 * @param context task context providing the job configuration
		 * @throws InterruptedException propagated from {@link #initialize}
		 * @throws IOException propagated from {@link #initialize}
		 */
		public MongoDBRecordReader(InputSplit split,TaskAttemptContext context) throws IOException, InterruptedException{
			super();
			initialize(split,context);
		}

		/**
		 * Creates a reader from pre-built state, e.g. with an already opened cursor.
		 */
		public MongoDBRecordReader(MongoDBInputSplit split, DBCursor dbcursor,
				int index, LongWritable key, V value) {
			super();
			this.split = split;
			this.dbcursor = dbcursor;
			this.index = index;
			this.key = key;
			this.value = value;
		}

		/**
		 * Stores the split and creates the reusable key and value objects. The
		 * value class is taken from the configuration key
		 * {@code mapred.mongo.split.value.class}, defaulting to
		 * {@link NULLMongoDBWritable}.
		 */
		@Override
		public void initialize(InputSplit split, TaskAttemptContext context)
				throws IOException, InterruptedException {
			this.split = (MongoDBInputSplit) split;
			this.key = new LongWritable();

			Configuration conf = context.getConfiguration();
			Class<?> valueClass = conf.getClass(VALUE_CLASS_KEY, NULLMongoDBWritable.class);
			// The configured class is expected to be a V; this cannot be checked at compile time.
			@SuppressWarnings("unchecked")
			V instance = (V) ReflectionUtils.newInstance(valueClass, conf);
			this.value = instance;
		}

		/**
		 * Advances to the next document of the split, opening the cursor on the
		 * first call.
		 *
		 * @return {@code true} if key and value were updated, {@code false}
		 *         when the split is exhausted
		 */
		@Override
		public boolean nextKeyValue() throws IOException, InterruptedException {
			if (this.dbcursor == null) {
				long length = this.split.getLength();
				if (length <= 0) {
					// MongoDB treats limit(0) as "no limit"; an empty split must read nothing.
					return false;
				}
				DB db = Mongo.connect(new DBAddress(MONGO_HOST, MONGO_DATABASE));
				this.mongo = db.getMongo();
				DBCollection dbCollection = db.getCollection(MONGO_COLLECTION);
				// NOTE(review): no explicit sort is applied, so split boundaries rely on the
				// collection's natural order staying stable while the job runs - confirm.
				this.dbcursor = dbCollection.find().skip((int) this.split.start).limit((int) length);
			}

			boolean hasNext = this.dbcursor.hasNext();
			if (hasNext) {
				DBObject dbObject = this.dbcursor.next();
				this.key.set(this.split.start + index);
				index++;
				this.value.readFields(dbObject);
			}
			return hasNext;
		}

		@Override
		public LongWritable getCurrentKey() throws IOException,
				InterruptedException {
			return this.key;
		}

		@Override
		public V getCurrentValue() throws IOException, InterruptedException {
			return this.value;
		}

		/**
		 * @return the fraction of the split's documents already returned, in
		 *         the range {@code [0, 1]}; 0 when the split is unset or empty
		 */
		@Override
		public float getProgress() throws IOException, InterruptedException {
			if (this.split == null) {
				return 0f;
			}
			long length = this.split.getLength();
			if (length <= 0) {
				return 0f;
			}
			return Math.min(1.0f, index / (float) length);
		}

		/**
		 * Closes the cursor and the connection opened by this reader. Safe to
		 * call when no cursor was ever opened.
		 */
		@Override
		public void close() throws IOException {
			try {
				if (dbcursor != null) {
					dbcursor.close();
					dbcursor = null;
				}
			} finally {
				if (mongo != null) {
					mongo.close();
					mongo = null;
				}
			}
		}
	}

	/**
	 * Creates the record reader for one split.
	 */
	@Override
	public RecordReader<LongWritable, V> createRecordReader(InputSplit split,
			TaskAttemptContext context) throws IOException,
			InterruptedException {
		return new MongoDBRecordReader<V>(split, context);
	}

}

自定义MongoDBOutPutFormat

package MongoDBMR;

import java.io.IOException;
import java.net.UnknownHostException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import com.mongodb.DB;
import com.mongodb.DBAddress;
import com.mongodb.DBCollection;
import com.mongodb.Mongo;

/**
 * Hadoop output format that hands every reduce output value to MongoDB.
 *
 * <p>Keys are ignored ({@link NullWritable}); each value writes itself through
 * {@link MongoDBWritable#write(DBCollection)}.
 *
 * @param <V> value type that knows how to write itself to a collection
 * @author gpy
 */
public class MongoDBOutPutFormat<V extends MongoDBWritable> extends OutputFormat<NullWritable, V>{

	/**
	 * Record writer that passes each value the target MongoDB collection.
	 *
	 * @author gpy
	 *
	 * @param <V> value type that knows how to write itself to a collection
	 */
	public static class MongoDBRecordWriter<V extends MongoDBWritable> extends RecordWriter<NullWritable, V>{
		/** Collection the values are written to; null until a connection is made. */
		public DBCollection dbCollection = null;

		/** Creates a writer without a connection; {@link #dbCollection} stays null. */
		public MongoDBRecordWriter() {
		}

		/**
		 * Connects to MongoDB and selects the {@code result} collection.
		 *
		 * @param context task context (not consulted)
		 * @throws IllegalStateException if the MongoDB host cannot be resolved;
		 *         failing here avoids a later NullPointerException on the first write
		 */
		public MongoDBRecordWriter(TaskAttemptContext context){
			try {
				DB db = Mongo.connect(new DBAddress("192.168.159.100", "hadoop"));
				dbCollection = db.getCollection("result");
			} catch (UnknownHostException e) {
				throw new IllegalStateException("Cannot resolve MongoDB host 192.168.159.100", e);
			}
		}

		/**
		 * Delegates to the value's own {@link MongoDBWritable#write(DBCollection)}.
		 *
		 * @param key ignored
		 * @param value record to store
		 */
		@Override
		public void write(NullWritable key, V value) throws IOException,
				InterruptedException {
			value.write(this.dbCollection);
		}

		/**
		 * Closes the MongoDB connection behind {@link #dbCollection}, if any.
		 */
		@Override
		public void close(TaskAttemptContext context) throws IOException,
				InterruptedException {
			if (dbCollection != null) {
				dbCollection.getDB().getMongo().close();
				dbCollection = null;
			}
		}

	}

	/**
	 * Creates a record writer connected to MongoDB.
	 */
	@Override
	public RecordWriter<NullWritable, V> getRecordWriter(
			TaskAttemptContext context) throws IOException,
			InterruptedException {
		return new MongoDBRecordWriter<V>(context);
	}

	/**
	 * Performs no validation of the output specification.
	 */
	@Override
	public void checkOutputSpecs(JobContext context) throws IOException,
			InterruptedException {
		// Nothing to check.
	}

	/**
	 * Returns a {@link FileOutputCommitter} built with a null output path,
	 * because this format produces no output files.
	 */
	@Override
	public OutputCommitter getOutputCommitter(TaskAttemptContext context)
			throws IOException, InterruptedException {
		return new FileOutputCommitter(null, context);
	}
}

自定义MongoDBWritable接口

package MongoDBMR;

import org.apache.hadoop.io.Writable;

import com.mongodb.DBCollection;
import com.mongodb.DBObject;

/**
 * A Hadoop {@link Writable} that can additionally be stored to and loaded from
 * MongoDB.
 *
 * @author gpy
 */
public interface MongoDBWritable extends Writable {

	/**
	 * Writes this object's data to MongoDB.
	 *
	 * @param dbCollection collection handed to the implementation for writing
	 */
	void write(DBCollection dbCollection);

	/**
	 * Reads this object's data from a MongoDB document.
	 *
	 * @param dbObject document handed to the implementation for reading
	 */
	void readFields(DBObject dbObject);
}

自定义数据类型

package MongoDBMR;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import com.mongodb.BasicDBObjectBuilder;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;

/**
 * 自定义数据类型
 * @author gpy
 *
 */
public class PersonsMongoDBWritable implements MongoDBWritable{

	public String name;
	public Integer age;
	public String sex;
	public Integer counter = 1;
	

	/**
	 * 注意处理空的值
	 */
	public void write(DataOutput out) throws IOException {
		out.writeUTF(name);
		if(age == null || sex == null){
			out.writeBoolean(false);
		} else {
			out.writeBoolean(true);
			out.writeInt(age);
			out.writeUTF(sex);
		}
		out.writeInt(counter);
	}

	public void readFields(DataInput in) throws IOException {
		this.name = in.readUTF();
		if(in.readBoolean()){
			this.sex = in.readUTF();
			this.age = in.readInt();
		} else {
			this.sex = null;
			this.age = null;
		}
		this.counter = in.readInt();
				
	}

	/**
	 * 写数据
	 */
	public void write(DBCollection dbCollection) {
		DBObject dbObject = BasicDBObjectBuilder.start().add("age", this.age).add("counter",this.counter).get();
		//将dbobject插入
		dbCollection.insert(dbObject);
	}

	/**
	 * 读数据
	 */
	public void readFields(DBObject dbObject) {
		this.name = dbObject.get("name").toString();
		if(dbObject.get("age") != null){
			this.age = Double.valueOf(dbObject.get("age").toString()).intValue();
		} else {
			this.age = null;
		}
	}
	
	/**
	 * @return the name
	 */
	public String getName() {
		return name;
	}

	/**
	 * @param name the name to set
	 */
	public void setName(String name) {
		this.name = name;
	}

	/**
	 * @return the age
	 */
	public Integer getAge() {
		return age;
	}

	/**
	 * @param age the age to set
	 */
	public void setAge(Integer age) {
		this.age = age;
	}

	/**
	 * @return the sex
	 */
	public String getSex() {
		return sex;
	}

	/**
	 * @param sex the sex to set
	 */
	public void setSex(String sex) {
		this.sex = sex;
	}

	/**
	 * @return the counter
	 */
	public Integer getCounter() {
		return counter;
	}

	/**
	 * @param counter the counter to set
	 */
	public void setCounter(Integer counter) {
		this.counter = counter;
	}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值