第六章 聚合
第四章
一 count
返回集合中的文档数量
> db.users.count()
5
二 distinct
查询所有给定键的所有不同的值
>db.runCommand({"distinct":"users","key":"username"})
{
"waitedMS" : NumberLong(0),
"values" : [
"andy",
"ann",
"mike",
"retacn yue"
],
"stats" : {
"n" : 4,
"nscanned" : 4,
"nscannedObjects" :0,
"timems" : 88,
"planSummary" :"DISTINCT_SCAN { username: 1.0 }"
},
"ok" : 1
}
三 group
在shell中运行
#查询所有数据
> db.prices.find()
{ "_id" :ObjectId("5872e51a2e29db2d68661bd3"), "time" :"10/03/2016 01:05:02", "price" : 4.23, "day" :"2016/10/03" }
{ "_id" : ObjectId("5872e51a2e29db2d68661bd4"),"time" : "10/04/2016 11:05:02", "price" : 4.27,"day" : "2016/10/04" }
{ "_id" :ObjectId("5872e51a2e29db2d68661bd5"), "time" :"10/03/2016 05:05:02", "price" : 4.1, "day" :"2016/10/03" }
{ "_id" :ObjectId("5872e51a2e29db2d68661bd6"), "time" :"10/06/2016 05:05:02", "price" : 4.3, "day" :"2016/10/06" }
{ "_id" :ObjectId("5872e51a2e29db2d68661bd7"), "time" :"10/04/2016 08:05:02", "price" : 4.01, "day" :"2016/10/04" }
>
#分组查询
> db.runCommand({"group":{
... "ns":"prices",
... "key":"day",
..."initial":{"time":""},
... "$reduce":function(doc,prev){
... if(doc.time>prev.time){
... prev.price=doc.price;
... prev.time=doc.time;
... }}}})
{
"waitedMS" : NumberLong(0),
"retval" : [
{
"time" :"10/06/2016 05:05:02",
"price" : 4.3
}
],
"count" : NumberLong(5),
"keys" : NumberLong(1),
"ok" : 1
}
#添加分组条件
> db.runCommand({"group":{
"ns":"prices",
"key":"day",
"initial":{"time":""},
"$reduce":function(doc,prev){
if(doc.time>prev.time){
prev.price=doc.price;
prev.time=doc.time;
}
},
"condition":{"day":{"$gt":"2016/09/30"}}}})
{
"waitedMS" : NumberLong(0),
"retval" : [
{
"time" :"10/06/2016 05:05:02",
"price" : 4.3
}
],
"count" : NumberLong(5),
"keys" : NumberLong(1),
"ok" : 1
}
使用python进行分组查询,示例代码如下:
from pymongo import MongoClient import statsmodels.api as sm import json import pandas as pd import time from datetime import datetime, date, time, timezone, timedelta import datetime as dt import time # 创建数据库连接 client = MongoClient() db = client.test # 添加记录 # db.prices.insert({"day":"2016/10/03","time":"10/03/2016 01:05:02","price":4.23}) # db.prices.insert({"day":"2016/10/04","time":"10/04/2016 11:05:02","price":4.27}) # db.prices.insert({"day":"2016/10/03","time":"10/03/2016 05:05:02","price":4.10}) # db.prices.insert({"day":"2016/10/06","time":"10/06/2016 05:05:02","price":4.30}) # db.prices.insert({"day":"2016/10/04","time":"10/04/2016 08:05:02","price":4.01}) # 分组查询 func = """ function(doc, prev){ if (doc.time > prev.time){ prev.price = doc.price; prev.time = doc.time; } } """ # 函数原型 # def group(self, # key, 进行分组的键 # condition, 设置分组条件 # initial, 指定reduce中参数 # reduce, 每一条记录都作为reduce函数的第一个参数执行 # finalize=None, # **kwargs):
gp = db.prices.group({"day": 1}, {"day": {"$gt": "2016/09/30"}}, {"time": ""}, func) print(gp)
查询结果如下
[{'day': '2016/10/03', 'price': 4.1,'time': '10/03/2016 05:05:02'},
{'day': '2016/10/04', 'price': 4.27,'time': '10/04/2016 11:05:02'},
{'day': '2016/10/06', 'price': 4.3, 'time':'10/06/2016 05:05:02'}]
使用完成器
#添加博客数据
>db.blog.posts.insert({"title":"A blogpost2","content":"...","author":{"name":"andy","email":"andy@sina.com"},"comments":[{"name":"tan","email":"tan
@sina.com","content":"good","votes":2}],"tags":{"java":5,"c++":2},"day":"2017/01/10"})
>db.blog.posts.insert({"title":"A blog post3","content":"...","author":{"name":"ruby","email":"ruby@sina.com"},"comments":[{"name":"san","email":"san
@sina.com","content":"good","votes":2}],"tags":{"c":6,"winter":2,"nosql":15},"day":"2017/01/11"})
WriteResult({ "nInserted" : 1 })
#查询所有数据
> db.blog.posts.find()
{ "_id" :ObjectId("586f21115da1e3bac2c373fe"), "title" : "Ablog post", "content" : "...", "author" : {"name" : "retacn yue", "email" : "zhenhuayue@
sina.com" }, "comments" : [{ "name" : "yue", "email" :"zhenhuayue@126.com", "content" : "good","votes" : 11 }, { "name" : "three","email" : "zhenh
uayue@hotmail.com","content" : "better", "votes" : 8 }, {"name" : "zhenhua", "email" :"zhenhuayue@qq.com", "content" : "best","votes" : 11 } ], "t
ags" : { "nosql" : 4,"python" : 10, "slendding" : 2 }, "day" :"2017/01/09" }
{ "_id" :ObjectId("5873036164a560b0e9430160"), "title" : "Ablog post2", "content" : "...", "author" : {"name" : "andy", "email" : "andy@sina.com"}
, "comments" : [ {"name" : "tan", "email" :"tan@sina.com", "content" : "good","votes" : 2 } ], "tags" : { "java" : 5,"c++" : 2 }, "day" : "2017/01
/10" }
{ "_id" :ObjectId("587303ed64a560b0e9430161"), "title" : "Ablog post3", "content" : "...", "author" : {"name" : "ruby", "email" : "ruby@sina.com"}
, "comments" : [ { "name": "san", "email" : "san@sina.com","content" : "good", "votes" : 2 } ],"tags" : { "c" : 6, "winter" : 2,"nosql" : 15 }, "d
ay" : "2017/01/11" }
#分组
> db.runCommand({"group":{
... "ns":"blog.posts",
..."key":{"tags":true},
..."initial":{"tags":{}},
... "$reduce":function(doc,prev){
... for(i in doc.tags){
... if(doc.tags[i] in prev.tags){
... prev.tags[doc.tags[i]]++;
... }else{
... prev.tags[doc.tags[i]]=1;
... }
... }
... }
... }})
#查询结果:
{
"waitedMS" : NumberLong(0),
"retval" : [
{
"tags" : {
"2" :1,
"4" :1,
"10": 1
}
},
{
"tags" : {
"2" :1,
"5" :1
}
},
{
"tags" : {
"2" :1,
"6" :1,
"15": 1
}
}
],
"count" : NumberLong(3),
"keys" : NumberLong(3),
"ok" : 1
}
#使用完成器(finalizer)精减后的分组
> db.runCommand({"group":{
... "ns":"blog.posts",
..."key":{"tags":true},
..."initial":{"tags":{}},
... "$reduce":function(doc,prev){
... for(i in doc.tags){
... if(doc.tags[i] in prev.tags){
... prev.tags[doc.tags[i]]++;
... }else{
... prev.tags[doc.tags[i]]=1;
... }
... }
... },
... "finalize":function(prev){
... var mostPopular=0;
... for(i in prev.tags){
... if(prev.tags[i]>mostPopular){
... prev.tag=i;
... mostPopulat=prev.tags[i];
... }
... }
... delete prev.tags
... }
... }})
{
"waitedMS" : NumberLong(0),
"retval" : [
{
"tag" :"10"
},
{
"tag" :"5"
},
{
"tag" :"15"
}
],
"count" : NumberLong(3),
"keys" : NumberLong(3),
"ok" : 1
}
>
Python代码实现,
from pymongo import MongoClient import statsmodels.api as sm import json import pandas as pd import time from datetime import datetime, date, time, timezone, timedelta import datetime as dt import time # 创建数据库连接 client = MongoClient() db = client.test func = """ function(doc, prev){ if (doc.time > prev.time){ prev.price = doc.price; prev.time = doc.time; } } """ func_none = """function(doc,prev){ for(i in doc.tags){ if(doc.tags[i] in prev.tags){ prev.tags[doc.tags[i]]++; }else{ prev.tags[doc.tags[i]]=1; } } } """ func_finalize = """function(prev){ var mostPopular=0; for(i in prev.tags){ if(prev.tags[i]>mostPopular){ prev.tag=i; mostPopulat=prev.tags[i]; } } delete prev.tags } """ # 函数原型 # def group(self, # key, 进行分组的键 # condition, 设置分组条件 # initial, 指定reduce中参数 # reduce, 每一条记录都作为reduce函数的第一个参数执行 # finalize=None, 完成器精减数据库到用户的数据 # **kwargs): # gp = db.prices.group({"day": 1}, {"day": {"$gt": "2016/09/30"}}, {"time": ""}, func) # 分组 gp_none = db.blog.posts.group({"tags": True}, {},{"tags": {}}, func_none ) print("gp_none", gp_none) # 使用完成器(finalizer)精减后的分组 gp_finalizer = db.blog.posts.group({"tags": True},{}, {"tags": {}}, func_none, finalize= func_finalize) print("gp_finalizer", gp_finalizer)
运行结果如下:
gp_none [{'tags': {'10': 1.0, '4': 1.0,'2': 1.0}}, {'tags': {'5': 1.0, '2': 1.0}}, {'tags': {'6': 1.0, '15': 1.0, '2':1.0}}]
gp_finalizer [{'tag': '10'}, {'tag': '5'},{'tag': '15'}]
将函数作为键使用
#将函数作为分级的键来使用,示例代码如下:
Db.posts.group({“ns”:”blog.posts”,”$keyf”:function(x){returnx.columnName.toLowerCase();}
...
})
四 mapreduce
可以并行化到多个服务器的聚合方法,运行速度较慢,不适合作实时的数据处理
1 找出集合中的所有键
#emit返回要处理的值,this为当前映射文档
>map=function(){
for(var key inthis){
emit(key,{count:1});
}
};
#key是emit返回的值,emits对应键的{count:1}文档
> reduce=function(key,emits){
... total=0;
... for(var i in emits){
... total+=emits[i].count;
... }
... return {"count":total};
... }
#方式一
>mr=db.runCommand({"mapreduce":"blog.posts","map":map,"reduce":reduce,"out":{inline:1}})
{
"results" : [
{
"_id" :"_id",
"value" : {
"count" : 3
}
},
{
"_id" :"author",
"value" : {
"count" : 3
}
},
{
"_id" :"comments",
"value" : {
"count" : 3
}
},
{
"_id" :"content",
"value" : {
"count" : 3
}
},
{
"_id" :"day",
"value" : {
"count" : 3
}
},
{
"_id" :"tags",
"value" : {
"count" : 3
}
},
{
"_id" :"title",
"value" : {
"count" : 3
}
}
],
"timeMillis" : 442, #操作所需时间
"counts" : {
"input" : 3, #发送到map函数的文档个数
"emit" : 21, #emit被调用的次数
"reduce" : 7 #reduce调用次数,
"output" : 7 #结果集中的文档数量
},
"ok" : 1
}
#方式二
>db.users.mapReduce(map,reduce,{out:{inline:1}})
{
"results" : [
{
"_id" :"_id",
"value" : {
"count" : 5
}
},
{
"_id" :"age",
"value" : {
"count" : 4
}
},
{
"_id" :"emails",
"value" : {
"count" : 4
}
},
{
"_id" :"languages",
"value" : {
"count" : 4
}
},
{
"_id" :"location",
"value" : {
"count" : 4
}
},
{
"_id" :"relationships",
"value" : {
"count" : 4
}
},
{
"_id" :"sex",
"value" : {
"count" : 4
}
},
{
"_id" :"state",
"value" : {
"count" : 4
}
},
{
"_id" :"username",
"value" : {
"count" : 5
}
}
],
"timeMillis" : 51,
"counts" : {
"input" : 5,
"emit" : 38,
"reduce" : 9,
"output" : 9
},
"ok" : 1
}
2 网页分类
3 mongodb与mapReduce
Mapreduce的可选键
Finalize 处理过程的最后一步
Keeptemp 临时结果集是否保存
Output 结果集合名
Query 发往map前的过滤条件
Sort 发往map前腓序
Limit 发往map前文档数量上限
Scope js变量
Verbose 详细日志