MapReduce是一种编程模型,用于大规模数据集(大于1TB)的并行运算。"Map(映射)"和"Reduce(化简)"这两个概念及其主要思想,都是从函数式编程语言里借来的,此外还借鉴了矢量编程语言的一些特性。
1. 先来看一个简单的例子,利用MongoDB的MapReduce功能进行分组统计。
数据表结构:用户行为表RecordModel,用户的每个行为记录为一条文档,下面利用MapReduce来统计每个用户所有行为的总数。
{"user_id" : NumberLong(10027857), "action_type" : 9, "create_time" : NumberLong("1308330304520") }
{"user_id" : NumberLong(10027858), "action_type" : 7, "create_time" : NumberLong("1308330556146") }
{"user_id" : NumberLong(10027859), "action_type" : 5, "create_time" : NumberLong("1308330834340") }
{"user_id" : NumberLong(10027859), "action_type" : 8, "create_time" : NumberLong("1308330896718") }
{"user_id" : NumberLong(22937), "action_type" : 9, "create_time" : NumberLong("1308332535982") }
{"user_id" : NumberLong(22937), "action_type" : 8, "create_time" : NumberLong("1308332563006") }
先定义map函数:
// Map step: runs once per document (bound to `this`) and emits a unit count
// keyed by that document's user_id, so the reduce step can sum per user.
m = function () {
    var groupKey = this.user_id;   // grouping key
    var unit = {count: 1};         // each visited record contributes 1
    emit(groupKey, unit);
};
再定义Reduce函数:
// Reduce step: folds every partial {count: n} value emitted for one key
// into a single {count: total} object. The output has the same shape as the
// inputs, so re-reducing partial results gives the same total.
r = function (key, values) {
    var total = values.reduce(function (sum, partial) {
        return sum + partial.count;
    }, 0);
    return {count: total};
};
执行 res = db.RecordModel.mapReduce(m, r, {out : {replace : 'things_reduce'}});
结果会写入things_reduce集合中(使用replace时,若该集合已存在,其内容会被整体替换),
最后执行 db.things_reduce.find(); 来查看结果。
执行结果:
> m = function(){
... emit( this.user_id, {count: 1} );
... };
function () {
emit(this.user_id, {count:1});
}
> r = function(key, values) {
... var result = {count: 0};
... values.forEach(function(value) {
... result.count += value.count;
... });
... return result;
... };
function (key, values) {
var result = {count:0};
values.forEach(function (value) {result.count += value.count;});
return result;
}
> res = db.RecordModel.mapReduce(m, r, {out : {replace : 'things_reduce'}});
{
"result" : "things_reduce",
"timeMillis" : 58032,
"counts" : {
"input" : 575113,
"emit" : 575113,
"output" : 19647
},
"ok" : 1,
}
> db.things_reduce.find();
{ "_id" : NumberLong(-10050025), "value" : { "count" : 4 } }
{ "_id" : NumberLong(1), "value" : { "count" : 15556 } }
{ "_id" : NumberLong(3), "value" : { "count" : 178 } }
{ "_id" : NumberLong(4), "value" : { "count" : 1649 } }
{ "_id" : NumberLong(5), "value" : { "count" : 422 } }
{ "_id" : NumberLong(7), "value" : { "count" : 627 } }
{ "_id" : NumberLong(9), "value" : { "count" : 125 } }
{ "_id" : NumberLong(10), "value" : { "count" : 871 } }
{ "_id" : NumberLong(72), "value" : { "count" : 12 } }
{ "_id" : NumberLong(1031), "value" : { "count" : 1 } }
{ "_id" : NumberLong(1032), "value" : { "count" : 1 } }
{ "_id" : NumberLong(1033), "value" : { "count" : 1 } }
{ "_id" : NumberLong(1034), "value" : { "count" : 2 } }
{ "_id" : NumberLong(1035), "value" : { "count" : 1 } }
{ "_id" : NumberLong(1038), "value" : { "count" : 1 } }
{ "_id" : NumberLong(1039), "value" : { "count" : 2 } }
{ "_id" : NumberLong(1041), "value" : { "count" : 19 } }
{ "_id" : NumberLong(1043), "value" : { "count" : 3 } }
{ "_id" : NumberLong(1044), "value" : { "count" : 2 } }
has more
---------------------------------------------------------------------------------------------------------------
以上是对所有数据进行统计,也可以只对部分数据进行统计,例如统计最近三天的数据,执行如下:
// The shell helper takes only (map, reduce, options), so the `query` filter
// must sit inside the options object next to `out`. Passed as a fourth
// argument it is silently ignored and every document is processed.
res = db.RecordModel.mapReduce(m, r, {out : {replace : 'things_reduce'}, query : {"create_time" : {$gt : 1308332565762}}});
如果要对结果进行排序,在最后查询结果集合时加上sort就可以了。
db.things_reduce.find().sort({"value.count":-1});