原文 http://my.oschina.net/huzorro/blog/73879
文档结构如下:
1
2
3
4
5
6
7
8
9
|
Spcode
Spname
Consignid
Consname
Region
Regionname
Serviceid
Servicename
Srctermid
|
一个月数据量大约1000w左右。
要实现任意字段的组合, 分组, 对Srctermid做去重操作。
MySQL:
1
2
|
SELECT
Spcode, Spname, Consignid, Consname,
COUNT
(
DISTINCT
(Srctermid))
FROM
mt_log_201208
GROUP
BY
Spcode, Spname, Consignid, Consname
|
mapreduce:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
res = db.runCommand({
mapreduce:
'mo_log_201208'
,
query:{Logtime:{$gte:
'20120801'
, $lte:
'20120821'
}},
map:
function
() {
emit({Spcode:this.Spcode, Spname:this.Spname,
Consignid:this.Consignid, Consname:this.Consname},
{
"data"
:[{
"Srctermid"
:this.Srctermid}]});
},
reduce:
function
(
key
, value) {
var ret = {data:[]};
var srctermid = {};
for
(var i
in
value) {
var ia = value[i];
for
(var j
in
ia.data) {
if(!srctermid[ia.data[j].Srctermid]) {
srctermid[ia.data[j].Srctermid] =
true
;
ret.data.push(ia.data[j]);
}
}
}
return
ret;
},
finalize:
function
(
key
,
values
){
return
{
count
:
values
.data.length};
},
out
:
'tmp_mo_spcode_consignid_1'
,
verbose:
true
})
|
1
2
3
4
5
6
7
8
9
10
11
12
|
> db[res.result].findOne();
{
"_id"
: {
"Spcode"
:
"20017"
,
"Spname"
:
"es"
,
"Consignid"
:
"000000"
,
"Consname"
:
"pa"
},
"value"
: {
"count"
: 978
}
}
|
因为是对任意字段的组合分组, 对Srctermid字段去重, 为了降低每次查询扫描的记录数, 可以按照完整条件, 生成中间结果集。
mapreduce:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
res = db.runCommand({
mapreduce:
'mo_log_201208'
,
query:{Logtime:{$gte:
'20120801'
, $lte:
'20120821'
}},
map:
function
() {
emit({Spcode:this.Spcode, Spname:this.Spname,
Consignid:this.Consignid, Consname:this.Consname,
Region:this.Region, Regionname:this.Regionname,
Serviceid:this.Serviceid, Servicename:this.Servicename},
{
"data"
:[{
"Srctermid"
:this.Srctermid}]});
},
reduce:
function
(
key
, value) {
var ret = {data:[]};
var srctermid = {};
for
(var i
in
value) {
var ia = value[i];
for
(var j
in
ia.data) {
if(!srctermid[ia.data[j].Srctermid]) {
srctermid[ia.data[j].Srctermid] =
true
;
ret.data.push(ia.data[j]);
}
}
}
return
ret;
},
out
:
'tmp_mo_spcode_consignid_region_serviceid_1'
,
verbose:
true
})
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
> db.tmp_mo_spcode_consignid_region_serviceid_20120819_1.findOne();
{
"_id"
: {
"Spcode"
:
"20017"
,
"Spname"
:
"es"
,
"Consignid"
:
"000000"
,
"Consname"
:
"pa"
,
"Region"
:
"0000"
,
"Regionname"
:
"sa"
,
"Serviceid"
:
"BZ"
,
"Servicename"
:
"aa"
},
"value"
: {
"data"
: [
{
"Srctermid"
:
"1864741xxxx"
},
{
"Srctermid"
:
"1862370xxxx"
},
{
"Srctermid"
:
"1862061xxxx"
},
]
}
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
res = db.runCommand({
mapreduce:
'tmp_mo_spcode_consignid_region_serviceid_20120819_1'
,
map:
function
() {
emit({Spcode:this._id.Spcode, Spname:this._id.Spname,
Consignid:this._id.Consignid, Consname:this._id.Consname},
{
"data"
:this.value.data});
},
reduce:
function
(
key
, value) {
var ret = {data:[]};
var srctermid = {};
for
(var i
in
value) {
var ia = value[i];
for
(var j
in
ia.data) {
if(!srctermid[ia.data[j].Srctermid]) {
srctermid[ia.data[j].Srctermid] =
true
;
ret.data.push(ia.data[j]);
}
}
}
return
ret;
},
finalize:
function
(
key
,
values
){
return
{
count
:
values
.data.length};
},
verbose:
true
})
|
1
2
3
4
5
6
7
8
9
10
11
12
|
> db[res.result].findOne();
{
"_id"
: {
"Spcode"
:
"20017"
,
"Spname"
:
"es"
,
"Consignid"
:
"000000"
,
"Consname"
:
"pa"
},
"value"
: {
"count"
: 978
}
}
|
mongodb对单个文档大小的限制是>v1.8版本的16MB, <v1.8的版本4MB
mongodb 单表最大索引数为64
无索引排序的最大数据量为4M, 超过则报错退出。
在上面的操作中, 如果reduce的values超过这个限制会报错退出, 严重的mongodb服务直接dbexit
1
2
3
4
5
6
|
{
"assertion"
:
"invoke failed: JS Error: out of memory nofile_b:5"
,
"assertionCode"
: 9004,
"errmsg"
:
"db assertion failure"
,
"ok"
: 0
}
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
res = db.runCommand({
mapreduce:
'mo_log_201208'
,
map:
function
() {
emit({Spcode:this.Spcode, Spname:this.Spname,
Consignid:this.Consignid, Consname:this.Consname,
Srctermid:this.Srctermid}, {
count
:1});
},
reduce:
function
(
key
, value) {
var ret = {
count
:0};
ret.
count
++;
return
ret;
},
out
:
'tmp_mo_spcode_consignid_region_serviceid_1'
,
verbose:
true
})
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
res = db.runCommand({
mapreduce:
'tmp_spcode_consignid_region_serviceid_1'
,
map:
function
() {
emit({Spcode:this._id.Spcode, Spname:this._id.Spname,
Consignid:this._id.Consignid, Srct:this._id.Consname},
{
count
:this.value.
count
});
},
reduce:
function
(
key
, value) {
var ret = {
count
:0};
for
(var i
in
value) {
ret.
count
+= value[i].
count
;
}
return
ret;
},
verbose:
true
})
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
res = db.runCommand({
mapreduce:
'mo_log_201208'
,
map:
function
() {
emit({Spcode:this.Spcode, Spname:this.Spname,
Consignid:this.Consignid, Consname:this.Consname,
Region:this.Region, Regionname:this.Regionname,
Serviceid:this.Serviceid, Servicename:this.Servicename,
Srctermid:this.Srctermid}, {
count
:1});
},
reduce:
function
(
key
, value) {
var ret = {
count
:0};
ret.
count
++;
return
ret;
},
out
:
'tmp_mo_spcode_consignid_region_serviceid_1'
,
verbose:
true
})
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
res = db.runCommand({
mapreduce:
'tmp_mo_spcode_consignid_region_serviceid_1'
,
map:
function
() {
emit({Spcode:this._id.Spcode, Spname:this._id.Spname,
Consignid:this._id.Consignid, Consname:this._id.Consname,
Srctermid:this._id.Srctermid}, {
count
:1});
},
reduce:
function
(
key
, value) {
var ret = {
count
:0};
ret.
count
++;
return
ret;
},
out
:
'tmp_mo_spcode_consignid_region_serviceid_2'
,
verbose:
true
})
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
res = db.runCommand({
mapreduce:
'tmp_spcode_consignid_region_serviceid_2'
,
map:
function
() {
emit({Spcode:this._id.Spcode, Spname:this._id.Spname,
Consignid:this._id.Consignid, Srct:this._id.Consname},
{
count
:this.value.
count
});
},
reduce:
function
(
key
, value) {
var ret = {
count
:0};
for
(var i
in
value) {
ret.
count
+= value[i].
count
;
}
return
ret;
},
verbose:
true
})
|
1
2
3
4
5
6
7
8
9
10
11
12
|
> db[res.result].findOne();
{
"_id"
: {
"Spcode"
:
"20017"
,
"Spname"
:
"es"
,
"Consignid"
:
"000000"
,
"Consname"
:
"pa"
},
"value"
: {
"count"
: 978
}
}
|
总结:mongodb 对map/reduce 的支持是单线程的, 灰常消耗系统资源, 避免在前台使用。
送上萌妹之音一枚~~~~