MongoDB 分组, 去重(转)

原文 http://my.oschina.net/huzorro/blog/73879


文档结构如下:

?
1
2
3
4
5
6
7
8
9
Spcode
Spname
Consignid
Consname
Region
Regionname
Serviceid
Servicename
Srctermid

一个月数据量大约1000w左右。

要实现任意字段的组合, 分组,  对Srctermid做去重操作。

MySQL:

?
1
2
SELECT Spcode, Spname, Consignid, Consname, COUNT ( DISTINCT (Srctermid))
FROM mt_log_201208 GROUP BY Spcode, Spname, Consignid, Consname

mapreduce: 

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
res = db.runCommand({
     mapreduce: 'mo_log_201208' ,
     query:{Logtime:{$gte: '20120801' , $lte: '20120821' }},
     map: function () {
         emit({Spcode:this.Spcode, Spname:this.Spname,
         Consignid:this.Consignid, Consname:this.Consname},
         { "data" :[{ "Srctermid" :this.Srctermid}]});
     },
     reduce: function ( key , value) {
         var ret = {data:[]};
         var srctermid = {};
         for (var i in value) {
             var ia = value[i];         
             for (var j in ia.data) {
                 if(!srctermid[ia.data[j].Srctermid]) {
                     srctermid[ia.data[j].Srctermid] = true ;
                     ret.data.push(ia.data[j]);
                 }
             }
         }
         return ret;
     },
     finalize: function ( key , values ){
         return { count : values .data.length};
     },
     out : 'tmp_mo_spcode_consignid_1' ,
     verbose: true
})
?
1
2
3
4
5
6
7
8
9
10
11
12
> db[res.result].findOne();
{
     "_id" : {
         "Spcode" : "20017" ,
         "Spname" : "es" ,
         "Consignid" : "000000" ,
         "Consname" : "pa"
     },
     "value" : {
         "count" : 978
     }
}

 因为是对任意字段的组合分组, 对Srctermid字段去重, 为了降低每次查询扫描的记录数, 可以按照完整条件, 生成中间结果集。

mapreduce:

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
res = db.runCommand({
     mapreduce: 'mo_log_201208' ,
     query:{Logtime:{$gte: '20120801' , $lte: '20120821' }},
     map: function () {
         emit({Spcode:this.Spcode, Spname:this.Spname,
                         Consignid:this.Consignid, Consname:this.Consname,
                         Region:this.Region, Regionname:this.Regionname,
                         Serviceid:this.Serviceid, Servicename:this.Servicename},
                         { "data" :[{ "Srctermid" :this.Srctermid}]});
     },
     reduce: function ( key , value) {
         var ret = {data:[]};
         var srctermid = {};
         for (var i in value) {
             var ia = value[i];         
             for (var j in ia.data) {
                 if(!srctermid[ia.data[j].Srctermid]) {
                     srctermid[ia.data[j].Srctermid] = true ;
                     ret.data.push(ia.data[j]);
                 }
             }
         }
         return ret;
     },
     out : 'tmp_mo_spcode_consignid_region_serviceid_1' ,
     verbose: true
})
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
> db.tmp_mo_spcode_consignid_region_serviceid_20120819_1.findOne();
{
     "_id" : {
         "Spcode" : "20017" ,
         "Spname" : "es" ,
         "Consignid" : "000000" ,
         "Consname" : "pa" ,
         "Region" : "0000" ,
         "Regionname" : "sa" ,
         "Serviceid" : "BZ" ,
         "Servicename" : "aa"
     },
     "value" : {
         "data" : [
             {
                 "Srctermid" : "1864741xxxx"
             },
             {
                 "Srctermid" : "1862370xxxx"
             },
             {
                 "Srctermid" : "1862061xxxx"
             },
         ]
     }
}
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
res = db.runCommand({
     mapreduce: 'tmp_mo_spcode_consignid_region_serviceid_20120819_1' ,
     map: function () {
         emit({Spcode:this._id.Spcode, Spname:this._id.Spname,
                         Consignid:this._id.Consignid, Consname:this._id.Consname},
                         { "data" :this.value.data});
     },
     reduce: function ( key , value) {
         var ret = {data:[]};
         var srctermid = {};
         for (var i in value) {
             var ia = value[i];         
             for (var j in ia.data) {
                 if(!srctermid[ia.data[j].Srctermid]) {
                     srctermid[ia.data[j].Srctermid] = true ;
                     ret.data.push(ia.data[j]);
                 }
             }
         }
         return ret;
     },
     finalize: function ( key , values ){
         return { count : values .data.length};
     },
     verbose: true
})
?
1
2
3
4
5
6
7
8
9
10
11
12
> db[res.result].findOne();
{
     "_id" : {
         "Spcode" : "20017" ,
         "Spname" : "es" ,
         "Consignid" : "000000" ,
         "Consname" : "pa"
     },
     "value" : {
         "count" : 978
     }
}

mongodb对单个文档大小的限制是>v1.8版本的16MB, <v1.8的版本4MB

mongodb 单表最大索引数为64

无索引排序的最大数据量为4M, 超过则报错退出。

在上面的操作中, 如果reduce的values超过这个限制会报错退出, 严重的mongodb服务直接dbexit

?
1
2
3
4
5
6
{
     "assertion" : "invoke failed: JS Error: out of memory nofile_b:5" ,
     "assertionCode" : 9004,
     "errmsg" : "db assertion failure" ,
     "ok" : 0
}
那么我们采用下面的方式来实现:先去重->再分组
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
res = db.runCommand({
     mapreduce: 'mo_log_201208' ,
     map: function () {
         emit({Spcode:this.Spcode, Spname:this.Spname,
                         Consignid:this.Consignid, Consname:this.Consname,
                         Srctermid:this.Srctermid}, { count :1});
     },
     reduce: function ( key , value) {
         var ret = { count :0};
         ret. count ++;
         return ret;
     },
     out : 'tmp_mo_spcode_consignid_region_serviceid_1' ,
     verbose: true
})
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
res = db.runCommand({
     mapreduce: 'tmp_spcode_consignid_region_serviceid_1' ,
     map: function () {
         emit({Spcode:this._id.Spcode, Spname:this._id.Spname,       
                         Consignid:this._id.Consignid, Srct:this._id.Consname},
                        { count :this.value. count });
     },
     reduce: function ( key , value) {
         var ret = { count :0};
         for (var i in value) {
             ret. count += value[i]. count ;
         }
         return ret;
     },
     verbose: true
})
同样为了减少每次查询扫描的记录数量, 可以生成中间结果集
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
res = db.runCommand({
     mapreduce: 'mo_log_201208' ,
     map: function () {
         emit({Spcode:this.Spcode, Spname:this.Spname,
                         Consignid:this.Consignid, Consname:this.Consname,
                         Region:this.Region, Regionname:this.Regionname,
                         Serviceid:this.Serviceid, Servicename:this.Servicename,
                         Srctermid:this.Srctermid}, { count :1});
     },
     reduce: function ( key , value) {
         var ret = { count :0};
         ret. count ++;
         return ret;
     },
     out : 'tmp_mo_spcode_consignid_region_serviceid_1' ,
     verbose: true
})
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
res = db.runCommand({
     mapreduce: 'tmp_mo_spcode_consignid_region_serviceid_1' ,
     map: function () {
         emit({Spcode:this._id.Spcode, Spname:this._id.Spname,
                         Consignid:this._id.Consignid, Consname:this._id.Consname,
                         Srctermid:this._id.Srctermid}, { count :1});
     },
     reduce: function ( key , value) {
         var ret = { count :0};
         ret. count ++;
         return ret;
     },
     out : 'tmp_mo_spcode_consignid_region_serviceid_2' ,
     verbose: true
})
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
res = db.runCommand({
     mapreduce: 'tmp_spcode_consignid_region_serviceid_2' ,
     map: function () {
         emit({Spcode:this._id.Spcode, Spname:this._id.Spname,       
                         Consignid:this._id.Consignid, Srct:this._id.Consname},
                        { count :this.value. count });
     },
     reduce: function ( key , value) {
         var ret = { count :0};
         for (var i in value) {
             ret. count += value[i]. count ;
         }
         return ret;
     },
     verbose: true
})
?
1
2
3
4
5
6
7
8
9
10
11
12
> db[res.result].findOne();
{
     "_id" : {
         "Spcode" : "20017" ,
         "Spname" : "es" ,
         "Consignid" : "000000" ,
         "Consname" : "pa"
     },
     "value" : {
         "count" : 978
     }
}

 总结:mongodb 对map/reduce 的支持是单线程的,   灰常消耗系统资源, 避免在前台使用。

 送上萌妹之音一枚~~~~

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值