MongoDB 分组, 去重

文档结构如下:

Spcode 
Spname
Consignid
Consname
Region
Regionname
Serviceid
Servicename
Srctermid

一个月数据量大约1000w左右。

要实现任意字段的组合, 分组,  对Srctermid做去重操作。

MySQL:

SELECT Spcode, Spname, Consignid, Consname, COUNT(DISTINCT(Srctermid))
FROM mt_log_201208 GROUP BY Spcode, Spname, Consignid, Consname

mapreduce: 

res = db.runCommand({
	mapreduce:'mo_log_201208',
	query:{Logtime:{$gte:'20120801', $lte:'20120821'}},
	map:function() {
		emit({Spcode:this.Spcode, Spname:this.Spname, 
		Consignid:this.Consignid, Consname:this.Consname}, 
		{"data":[{"Srctermid":this.Srctermid}]});
	},
	reduce:function(key, value) {
		var ret = {data:[]};
		var srctermid = {};
		for(var i in value) {
			var ia = value[i];			
			for(var j in ia.data) {
				if(!srctermid[ia.data[j].Srctermid]) {
					srctermid[ia.data[j].Srctermid] = true;
					ret.data.push(ia.data[j]);
				}
			}
		}
        return ret;
	},
	finalize:function(key, values){
		return {count:values.data.length};
	},
	out:'tmp_mo_spcode_consignid_1',
	verbose:true
})
> db[res.result].findOne();
{
	"_id" : {
		"Spcode" : "20017",
		"Spname" : "es",
		"Consignid" : "000000",
		"Consname" : "pa"
	},
	"value" : {
		"count" : 978
	}
}

 因为是对任意字段的组合分组, 对Srctermid字段去重, 为了降低每次查询扫描的记录数, 可以按照完整条件, 生成中间结果集。

mapreduce:

res = db.runCommand({
	mapreduce:'mo_log_201208',
	query:{Logtime:{$gte:'20120801', $lte:'20120821'}},
	map:function() {
		emit({Spcode:this.Spcode, Spname:this.Spname, 
                        Consignid:this.Consignid, Consname:this.Consname, 
                        Region:this.Region, Regionname:this.Regionname, 
                        Serviceid:this.Serviceid, Servicename:this.Servicename}, 
                        {"data":[{"Srctermid":this.Srctermid}]});
	},
	reduce:function(key, value) {
		var ret = {data:[]};
		var srctermid = {};
		for(var i in value) {
			var ia = value[i];			
			for(var j in ia.data) {
				if(!srctermid[ia.data[j].Srctermid]) {
					srctermid[ia.data[j].Srctermid] = true;
					ret.data.push(ia.data[j]);
				}
			}
		}
        return ret;
	},
	out:'tmp_mo_spcode_consignid_region_serviceid_1',
	verbose:true
})
> db.tmp_mo_spcode_consignid_region_serviceid_20120819_1.findOne();
{
	"_id" : {
		"Spcode" : "20017",
		"Spname" : "es",
		"Consignid" : "000000",
		"Consname" : "pa",
		"Region" : "0000",
		"Regionname" : "sa",
		"Serviceid" : "BZ",
		"Servicename" : "aa"
	},
	"value" : {
		"data" : [
			{
				"Srctermid" : "1864741xxxx"
			},
			{
				"Srctermid" : "1862370xxxx"
			},
			{
				"Srctermid" : "1862061xxxx"
			},
		]
	}
}
res = db.runCommand({
	mapreduce:'tmp_mo_spcode_consignid_region_serviceid_20120819_1',
	map:function() {
		emit({Spcode:this._id.Spcode, Spname:this._id.Spname,
                        Consignid:this._id.Consignid, Consname:this._id.Consname}, 
                        {"data":this.value.data});
	},
	reduce:function(key, value) {
		var ret = {data:[]};
		var srctermid = {};
		for(var i in value) {
			var ia = value[i];			
			for(var j in ia.data) {
				if(!srctermid[ia.data[j].Srctermid]) {
					srctermid[ia.data[j].Srctermid] = true;
					ret.data.push(ia.data[j]);
				}
			}
		}
        return ret;
	},
	finalize:function(key, values){
		return {count:values.data.length};
	},
	verbose:true
})
> db[res.result].findOne();
{
	"_id" : {
		"Spcode" : "20017",
		"Spname" : "es",
		"Consignid" : "000000",
		"Consname" : "pa"
	},
	"value" : {
		"count" : 978
	}
}

mongodb对单个文档大小的限制是>v1.8版本的16MB, <v1.8的版本4MB

mongodb 单表最大索引数为64

无索引排序的最大数据量为4M, 超过则报错退出。

在上面的操作中, 如果reduce的values超过这个限制会报错退出, 严重的mongodb服务直接dbexit

{
	"assertion" : "invoke failed: JS Error: out of memory nofile_b:5",
	"assertionCode" : 9004,
	"errmsg" : "db assertion failure",
	"ok" : 0
}
那么我们采用下面的方式来实现:先去重->再分组
res = db.runCommand({
	mapreduce:'mo_log_201208',
	map:function() {
		emit({Spcode:this.Spcode, Spname:this.Spname, 
                        Consignid:this.Consignid, Consname:this.Consname,
                        Srctermid:this.Srctermid}, {count:1});
	},
	reduce:function(key, value) {
		var ret = {count:0};
		ret.count++;
		return ret;
	},
	out:'tmp_mo_spcode_consignid_region_serviceid_1',
	verbose:true
})
res = db.runCommand({
	mapreduce:'tmp_spcode_consignid_region_serviceid_1',
	map:function() {
		emit({Spcode:this._id.Spcode, Spname:this._id.Spname,        
                        Consignid:this._id.Consignid, Srct:this._id.Consname}, 
                        {count:this.value.count});
	},
	reduce:function(key, value) {
		var ret = {count:0};
		for(var i in value) {
			ret.count += value[i].count;
		}
		return ret;
	},
	verbose:true
})
同样为了减少每次查询扫描的记录数量, 可以生成中间结果集
res = db.runCommand({
	mapreduce:'mo_log_201208',
	map:function() {
		emit({Spcode:this.Spcode, Spname:this.Spname,
                        Consignid:this.Consignid, Consname:this.Consname, 
                        Region:this.Region, Regionname:this.Regionname, 
                        Serviceid:this.Serviceid, Servicename:this.Servicename, 
                        Srctermid:this.Srctermid}, {count:1});
	},
	reduce:function(key, value) {
		var ret = {count:0};
		ret.count++;
		return ret;
	},
	out:'tmp_mo_spcode_consignid_region_serviceid_1',
	verbose:true
})
res = db.runCommand({
	mapreduce:'tmp_mo_spcode_consignid_region_serviceid_1',
	map:function() {
		emit({Spcode:this._id.Spcode, Spname:this._id.Spname, 
                        Consignid:this._id.Consignid, Consname:this._id.Consname,
                        Srctermid:this._id.Srctermid}, {count:1});
	},
	reduce:function(key, value) {
		var ret = {count:0};
		ret.count++;
		return ret;
	},
	out:'tmp_mo_spcode_consignid_region_serviceid_2',
	verbose:true
})
res = db.runCommand({
	mapreduce:'tmp_spcode_consignid_region_serviceid_2',
	map:function() {
		emit({Spcode:this._id.Spcode, Spname:this._id.Spname,        
                        Consignid:this._id.Consignid, Srct:this._id.Consname}, 
                        {count:this.value.count});
	},
	reduce:function(key, value) {
		var ret = {count:0};
		for(var i in value) {
			ret.count += value[i].count;
		}
		return ret;
	},
	verbose:true
})
> db[res.result].findOne();
{
	"_id" : {
		"Spcode" : "20017",
		"Spname" : "es",
		"Consignid" : "000000",
		"Consname" : "pa"
	},
	"value" : {
		"count" : 978
	}
}

 总结:mongodb 对map/reduce 的支持是单线程的,   灰常消耗系统资源, 避免在前台使用。

 送上萌妹之音一枚~~~~


转载于:https://my.oschina.net/huzorro/blog/73879

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值