MongoDB高级查询
数据
from pymongo import MongoClient
import datetime
# Seed the db1.emp collection with the sample employee records that the
# aggregation examples below query.
# NOTE: the connection URI embeds a user name and password; acceptable for a
# local demo only.
client = MongoClient('mongodb://root:123@192.168.32.100:27017')
table = client['db1']['emp']

# Each row: (name, sex, age, hire date as YYYYMMDD, post, salary, plus two
# trailing values that are never written to the database).
l = [
    ('张飞', 'male', 18, '20170301', 'teacher', 7300.33, 401, 1),
    ('张云', 'male', 78, '20150302', 'teacher', 1000000.31, 401, 1),
    ('刘备', 'male', 81, '20130305', 'teacher', 8300, 401, 1),
    ('关羽', 'male', 73, '20140701', 'teacher', 3500, 401, 1),
    ('曹操', 'male', 28, '20121101', 'teacher', 2100, 401, 1),
    ('诸葛亮', 'female', 18, '20110211', 'teacher', 9000, 401, 1),
    ('周瑜', 'male', 18, '19000301', 'teacher', 30000, 401, 1),
    ('司马懿', 'male', 48, '20101111', 'teacher', 10000, 401, 1),
    ('袁绍', 'female', 48, '20150311', 'sale', 3000.13, 402, 2),
    ('张全蛋', 'female', 38, '20101101', 'sale', 2000.35, 402, 2),
    ('鹌鹑蛋', 'female', 18, '20110312', 'sale', 1000.37, 402, 2),
    ('王尼玛', 'female', 18, '20160513', 'sale', 3000.29, 402, 2),
    ('我尼玛', 'female', 28, '20170127', 'sale', 4000.33, 402, 2),
    ('杨过', 'male', 28, '20160311', 'operation', 10000.13, 403, 3),
    ('小龙女', 'male', 18, '19970312', 'operation', 20000, 403, 3),
    ('郭靖', 'female', 18, '20130311', 'operation', 19000, 403, 3),
    ('黄蓉', 'male', 18, '20150411', 'operation', 18000, 403, 3),
    ('梅超风', 'female', 18, '20140512', 'operation', 17000, 403, 3),
]

for n, item in enumerate(l):
    name, sex, age, hired, post, salary = item[:6]
    d = {
        "_id": n,
        'name': name,
        'sex': sex,
        'age': age,
        # Stored as a real datetime so that $year can be applied to it later.
        'hire_date': datetime.datetime.strptime(hired, '%Y%m%d'),
        'post': post,
        'salary': salary,
    }
    # Collection.save() was removed in PyMongo 4. For a document that carries
    # an _id, the equivalent is a replace keyed on that _id with upsert=True,
    # which also keeps the script safe to re-run.
    table.replace_one({"_id": n}, d, upsert=True)
聚合函数
// Per-post salary statistics: highest, lowest, average, total, and head count.
db.emp.aggregate([
    { $group: {
        _id: "$post",
        "最高工资": { $max: "$salary" },
        "最低工资": { $min: "$salary" },
        "平均工资": { $avg: "$salary" },
        "总工资": { $sum: "$salary" },
        "人数": { $sum: 1 }
    } }
])

// First and last employee name seen in each post group.
db.emp.aggregate([
    { $group: {
        _id: "$post",
        "第一个": { $first: "$name" },
        "最后一个": { $last: "$name" }
    } }
])

// All employee names per post, collected with $push (duplicates are kept).
db.emp.aggregate([
    { $group: {
        _id: "$post",
        "人员名单": { $push: "$name" }
    } }
])

// All employee names per post, collected with $addToSet (duplicates are dropped).
db.emp.aggregate([
    { $group: {
        _id: "$post",
        "人员名单": { $addToSet: "$name" }
    } }
])
$match 过滤
// Keep only the document whose name is "鹌鹑蛋".
db.emp.aggregate([
    { $match: { name: "鹌鹑蛋" } }
])

// Keep only documents whose _id is greater than 3.
db.emp.aggregate([
    { $match: { _id: { $gt: 3 } } }
])

// Group by post, then filter the groups: keep posts whose top salary exceeds
// 10000, and drop the "teacher" post.
db.emp.aggregate([
    { $group: {
        _id: "$post",
        max_salary: { $max: "$salary" }
    } },
    { $match: { max_salary: { $gt: 10000 } } },
    { $match: { _id: { $ne: "teacher" } } }
])
$project
// Yearly salary: 12 times the stored salary.
db.emp.aggregate([
    { $project: {
        name: 1,
        _id: 0,
        year_salary: { $multiply: [12, "$salary"] }
    } }
])

// Age ten years from now.
// (The original call was missing its closing parenthesis, which made this
// statement swallow the next one.)
db.emp.aggregate([
    { $project: {
        name: 1,
        _id: 0,
        after_10_year: { $add: [10, "$age"] }
    } }
])

// Age ten years ago.
db.emp.aggregate([
    { $project: {
        name: 1,
        _id: 0,
        befor_10_year: { $subtract: ["$age", 10] }
    } }
])

// Calendar year in which each employee was hired.
db.emp.aggregate([
    { $project: {
        name: 1,
        _id: 0,
        year: { $year: "$hire_date" }
    } }
])

// Years of service: current year minus hire year.
db.emp.aggregate([
    { $project: {
        name: 1,
        _id: 0,
        job_year: { $subtract: [{ $year: new Date() }, { $year: "$hire_date" }] }
    } }
])

// Leading part of the name.
// NOTE(review): $substr is the deprecated alias of $substrBytes and counts
// bytes, so a length of 3 should correspond to a single UTF-8 encoded Chinese
// character here - confirm against the server version in use.
db.emp.aggregate([
    { $project: {
        first_name: { $substr: ["$name", 0, 3] },
        _id: 0
    } }
])

// Name and post concatenated into a single string.
db.emp.aggregate([
    { $project: {
        full_info: { $concat: ["$name", "$post"] },
        _id: 0
    } }
])
$sort, $limit, $skip
// Sort by _id ascending, keep the first 10 documents, then drop the first of
// those. Stages run in order, so $limit is applied before $skip.
db.emp.aggregate([
    { $sort: { _id: 1 } },
    { $limit: 10 },
    { $skip: 1 }
])
$sample
// Pick 3 documents at random from the collection.
db.emp.aggregate([
    { $sample: { size: 3 } }
])
可视化工具
https://robomongo.org
selenium + mongodb爬取京东商品
============================== spider.py ==============================
import time
from urllib. parse import urlencode
from selenium. webdriver import Chrome
from selenium. webdriver. support. wait import WebDriverWait
from selenium. webdriver. support import expected_conditions as EC
from selenium. webdriver. common. by import By
from mongoTest import DBTool
# Build the JD search URL for the keyword; the same keyword is sent as both
# the "keyword" and the "wq" query parameters.
kw = "黄金"
par = dict(enc="utf-8", keyword=kw, wq=kw)
kw = urlencode(par)  # kw now holds the encoded query string
url = "https://search.jd.com/Search?" + kw

# Open the results page in a fresh Chrome session.
driver = Chrome()
driver.get(url)

# Allow up to 10 s for element lookups while the page height is read, then
# lower the implicit wait to 5 s for the rest of the run.
driver.implicitly_wait(10)
height = driver.execute_script("return document.body.clientHeight")
driver.implicitly_wait(5)
def get_datas(expected=60, timeout=30.0, poll=0.5):
    """Return the product elements of the current results page.

    The page is polled until at least ``expected`` ``.gl-item`` elements are
    present under the ``.gl-warp`` list, or until ``timeout`` seconds have
    passed. On timeout, whatever has been found so far is returned.

    The original version recursed until exactly 60 items were present, which
    never terminated (and eventually raised RecursionError) on any page that
    does not reach that count.

    Args:
        expected: Number of items that counts as a fully loaded page.
        timeout: Maximum number of seconds to keep polling.
        poll: Seconds to sleep between two polls.

    Returns:
        The list of item elements found on the last poll.
    """
    deadline = time.monotonic() + timeout
    while True:
        # find_element(By..., ...) is available in both Selenium 3 and 4; the
        # find_element_by_* helpers were removed in Selenium 4.
        ul = driver.find_element(By.CLASS_NAME, "gl-warp")
        items = ul.find_elements(By.CLASS_NAME, "gl-item")
        if len(items) >= expected or time.monotonic() >= deadline:
            return items
        time.sleep(poll)
def paser_data(items):
    """Extract the fields of every product element and store them.

    For each element, the link, image URL, price, title, shop name and the
    text of the ``.p-commit`` node are read and passed to
    ``DBTool.insert_data`` as one dict.

    Args:
        items: Iterable of product elements, as returned by ``get_datas``.
    """
    print(len(items))
    for item in items:
        link = item.find_element(By.CSS_SELECTOR, ".p-img a").get_attribute("href")

        # Look the image element up once and read both attributes from it.
        img_tag = item.find_element(By.CSS_SELECTOR, ".p-img a img")
        img = img_tag.get_attribute("src")
        if not img:
            # No "src" yet: fall back to "data-lazy-img" and prefix the scheme.
            # NOTE(review): the original indentation was lost; the "https:"
            # prefix is assumed to apply only to this fallback - confirm.
            lazy = img_tag.get_attribute("data-lazy-img")
            # Guard against both attributes being absent, which previously
            # raised TypeError on "https:" + None.
            img = "https:" + lazy if lazy else None

        price = item.find_element(By.CSS_SELECTOR, ".p-price i").text
        title = item.find_element(By.CSS_SELECTOR, ".p-name a em").text
        shop_name = item.find_element(By.CSS_SELECTOR, ".p-shop a").text
        commit = item.find_element(By.CSS_SELECTOR, ".p-commit").text
        print("=========================================")
        dic = {
            "link": link,
            "img": img,
            "title": title,
            "shop_name": shop_name,
            "commit": commit,
            "price": price,
        }
        DBTool.insert_data(dic)
def get_next():
    """Go to the next results page, scroll it, and scrape its items.

    Clicks the link whose text contains "下一页", scrolls to the page height
    recorded at start-up, waits one second, then collects the items with
    ``get_datas`` and stores them with ``paser_data``.
    """
    # Named next_link so the builtin next() is not shadowed. The By-based
    # lookup works on Selenium 3 and 4 (find_element_by_* is gone in 4).
    next_link = driver.find_element(By.PARTIAL_LINK_TEXT, "下一页")
    next_link.click()
    driver.execute_script("""
window.scrollTo({
top: %s,
behavior: "smooth"
});""" % height)
    print("11111")
    time.sleep(1)
    print("continue....")
    items = get_datas()
    paser_data(items)
# Scrape the first results page, then follow "next page" five times. The
# try/finally guarantees the browser is shut down even if scraping fails.
try:
    driver.execute_script("""
window.scrollTo({
top: %s,
behavior: "smooth"
});""" % height)
    items = get_datas()
    paser_data(items)
    for _ in range(5):
        get_next()
    # NOTE(review): the original indentation was lost; this 30 s pause is
    # assumed to run once after the loop rather than on every page - confirm.
    time.sleep(30)
finally:
    # quit() ends the whole WebDriver session; close() only closes the
    # current window and can leave the driver process running.
    driver.quit()
============================== DBTool.py ==============================
"""
连接数据库 保存数据
"""
from pymongo import MongoClient
# Module-level connection state; both names stay None until connect_server()
# assigns them.
c = None      # MongoClient instance
table = None  # collection handle (c["jd"]["jd_data"])
def connect_server(uri="mongodb://root:123@192.168.32.100:27017"):
    """Open the MongoDB connection and bind the module-level handles.

    Sets the globals ``c`` (the client) and ``table`` (the ``jd_data``
    collection of the ``jd`` database), then prints the collection.

    Args:
        uri: MongoDB connection URI. Defaults to the address that was
            previously hard-coded, so existing callers behave as before.
            NOTE: the default embeds credentials; pass a URI from
            configuration outside of demos.
    """
    global table, c
    c = MongoClient(uri)
    table = c["jd"]["jd_data"]
    print(table)
def insert_data(data):
    """Insert ``data`` into the collection, connecting first if needed.

    Args:
        data: A single document (dict), or a list/tuple of documents.
    """
    # Compare with None explicitly: PyMongo Collection objects do not support
    # truth-value testing (bool(collection) raises NotImplementedError in
    # PyMongo 4).
    if table is None:
        connect_server()
    # Collection.insert() was removed in PyMongo 4. It accepted either one
    # document or a list of documents, so dispatch to the matching call.
    if isinstance(data, (list, tuple)):
        table.insert_many(data)
    else:
        table.insert_one(data)
def close():
    """Close the MongoDB client, if one is open, and reset the module state.

    Safe to call when no connection was ever opened; previously that raised
    AttributeError because ``c`` was still None.
    """
    global table, c
    if c is not None:
        c.close()
    # Drop both handles so a later insert_data() reconnects instead of using
    # a collection bound to a closed client.
    c = None
    table = None