文档结构如下:
Spcode
Spname
Consignid
Consname
Region
Regionname
Serviceid
Servicename
Srctermid
一个月数据量大约1000w左右。
要实现任意字段的组合, 分组, 对Srctermid做去重操作。
MySQL:
SELECT Spcode, Spname, Consignid, Consname, COUNT(DISTINCT(Srctermid))
FROM mt_log_201208 GROUP BY Spcode, Spname, Consignid, Consname
mapreduce:
res = db.runCommand({
mapreduce:'mo_log_201208',
query:{Logtime:{$gte:'20120801', $lte:'20120821'}},
map:function() {
emit({Spcode:this.Spcode, Spname:this.Spname,
Consignid:this.Consignid, Consname:this.Consname},
{"data":[{"Srctermid":this.Srctermid}]});
},
reduce:function(key, value) {
var ret = {data:[]};
var srctermid = {};
for(var i in value) {
var ia = value[i];
for(var j in ia.data) {
if(!srctermid[ia.data[j].Srctermid]) {
srctermid[ia.data[j].Srctermid] = true;
ret.data.push(ia.data[j]);
}
}
}
return ret;
},
finalize:function(key, values){
return {count:values.data.length};
},
out:'tmp_mo_spcode_consignid_1',
verbose:true
})
> db[res.result].findOne();
{
"_id" : {
"Spcode" : "20017",
"Spname" : "es",
"Consignid" : "000000",
"Consname" : "pa"
},
"value" : {
"count" : 978
}
}
因为是对任意字段的组合分组, 对Srctermid字段去重, 为了降低每次查询扫描的记录数, 可以按照完整条件, 生成中间结果集。
mapreduce:
res = db.runCommand({
mapreduce:'mo_log_201208',
query:{Logtime:{$gte:'20120801', $lte:'20120821'}},
map:function() {
emit({Spcode:this.Spcode, Spname:this.Spname,
Consignid:this.Consignid, Consname:this.Consname,
Region:this.Region, Regionname:this.Regionname,
Serviceid:this.Serviceid, Servicename:this.Servicename},
{"data":[{"Srctermid":this.Srctermid}]});
},
reduce:function(key, value) {
var ret = {data:[]};
var srctermid = {};
for(var i in value) {
var ia = value[i];
for(var j in ia.data) {
if(!srctermid[ia.data[j].Srctermid]) {
srctermid[ia.data[j].Srctermid] = true;
ret.data.push(ia.data[j]);
}
}
}
return ret;
},
out:'tmp_mo_spcode_consignid_region_serviceid_1',
verbose:true
})
> db.tmp_mo_spcode_consignid_region_serviceid_20120819_1.findOne();
{
"_id" : {
"Spcode" : "20017",
"Spname" : "es",
"Consignid" : "000000",
"Consname" : "pa",
"Region" : "0000",
"Regionname" : "sa",
"Serviceid" : "BZ",
"Servicename" : "aa"
},
"value" : {
"data" : [
{
"Srctermid" : "1864741xxxx"
},
{
"Srctermid" : "1862370xxxx"
},
{
"Srctermid" : "1862061xxxx"
},
]
}
}
res = db.runCommand({
mapreduce:'tmp_mo_spcode_consignid_region_serviceid_20120819_1',
map:function() {
emit({Spcode:this._id.Spcode, Spname:this._id.Spname,
Consignid:this._id.Consignid, Consname:this._id.Consname},
{"data":this.value.data});
},
reduce:function(key, value) {
var ret = {data:[]};
var srctermid = {};
for(var i in value) {
var ia = value[i];
for(var j in ia.data) {
if(!srctermid[ia.data[j].Srctermid]) {
srctermid[ia.data[j].Srctermid] = true;
ret.data.push(ia.data[j]);
}
}
}
return ret;
},
finalize:function(key, values){
return {count:values.data.length};
},
verbose:true
})
> db[res.result].findOne();
{
"_id" : {
"Spcode" : "20017",
"Spname" : "es",
"Consignid" : "000000",
"Consname" : "pa"
},
"value" : {
"count" : 978
}
}
mongodb对单个文档大小的限制是>v1.8版本的16MB, <v1.8的版本4MB
mongodb 单表最大索引数为64
无索引排序的最大数据量为4M, 超过则报错退出。
在上面的操作中, 如果reduce的values超过这个限制会报错退出, 严重的mongodb服务直接dbexit
{
"assertion" : "invoke failed: JS Error: out of memory nofile_b:5",
"assertionCode" : 9004,
"errmsg" : "db assertion failure",
"ok" : 0
}
那么我们采用下面的方式来实现:先去重->再分组
res = db.runCommand({
mapreduce:'mo_log_201208',
map:function() {
emit({Spcode:this.Spcode, Spname:this.Spname,
Consignid:this.Consignid, Consname:this.Consname,
Srctermid:this.Srctermid}, {count:1});
},
reduce:function(key, value) {
var ret = {count:0};
ret.count++;
return ret;
},
out:'tmp_mo_spcode_consignid_region_serviceid_1',
verbose:true
})
res = db.runCommand({
mapreduce:'tmp_spcode_consignid_region_serviceid_1',
map:function() {
emit({Spcode:this._id.Spcode, Spname:this._id.Spname,
Consignid:this._id.Consignid, Srct:this._id.Consname},
{count:this.value.count});
},
reduce:function(key, value) {
var ret = {count:0};
for(var i in value) {
ret.count += value[i].count;
}
return ret;
},
verbose:true
})
同样为了减少每次查询扫描的记录数量, 可以生成中间结果集
res = db.runCommand({
mapreduce:'mo_log_201208',
map:function() {
emit({Spcode:this.Spcode, Spname:this.Spname,
Consignid:this.Consignid, Consname:this.Consname,
Region:this.Region, Regionname:this.Regionname,
Serviceid:this.Serviceid, Servicename:this.Servicename,
Srctermid:this.Srctermid}, {count:1});
},
reduce:function(key, value) {
var ret = {count:0};
ret.count++;
return ret;
},
out:'tmp_mo_spcode_consignid_region_serviceid_1',
verbose:true
})
res = db.runCommand({
mapreduce:'tmp_mo_spcode_consignid_region_serviceid_1',
map:function() {
emit({Spcode:this._id.Spcode, Spname:this._id.Spname,
Consignid:this._id.Consignid, Consname:this._id.Consname,
Srctermid:this._id.Srctermid}, {count:1});
},
reduce:function(key, value) {
var ret = {count:0};
ret.count++;
return ret;
},
out:'tmp_mo_spcode_consignid_region_serviceid_2',
verbose:true
})
res = db.runCommand({
mapreduce:'tmp_spcode_consignid_region_serviceid_2',
map:function() {
emit({Spcode:this._id.Spcode, Spname:this._id.Spname,
Consignid:this._id.Consignid, Srct:this._id.Consname},
{count:this.value.count});
},
reduce:function(key, value) {
var ret = {count:0};
for(var i in value) {
ret.count += value[i].count;
}
return ret;
},
verbose:true
})
> db[res.result].findOne();
{
"_id" : {
"Spcode" : "20017",
"Spname" : "es",
"Consignid" : "000000",
"Consname" : "pa"
},
"value" : {
"count" : 978
}
}
总结:mongodb 对map/reduce 的支持是单线程的, 灰常消耗系统资源, 避免在前台使用。
送上萌妹之音一枚~~~~