2.1 在 MONGODB中筛选房源
再在 1-3练习的基础上做修改。获取了url后只需要在获取详情页处添加:
for title,district,img,price,hostName,hostPicSrc,hostSexual in zip(titles,districts,imgs,prices,hostNames,hostPicSrcs,hostSexuals):
data={
'title =':title.get_text(),
'district=':district.get_text().strip(),
'price=': price.get_text(),
'hostName=': hostName.get_text(),
'hostPicSrc=': hostPicSrc.get('src'),
'hostSexual=': GetSuxual(hostSexual.get('class')),
'img=': img.get('src'),
}
SheetTab.insert_one(data)
,然后看到数据库中就把这些添加进去了。
接下来,先把刚才加上去的那句话 SheetTab.insert_one 去掉,以免再次运行的时候又反复添加。然后添加筛选条件:
for item in SheetTab.find({'price=': {'$gt': '500'}}):
(注意:这里 price 存的是字符串,'$gt' 会按字典序比较;若要按数值大小筛选,入库时应把价格转成数字类型。)
代码
#coding=utf-8
from bs4 import BeautifulSoup
import requests
import time
import pymongo
# MongoDB connection: database 'HouseRent', collection 'sheetTab' holds the scraped listings.
client = pymongo.MongoClient('localhost',27017)
HouseRent = client['HouseRent']
SheetTab = HouseRent['sheetTab']
# First listing page of the short-term-rental search results (page number is the 'p1' segment).
url = 'http://bj.xiaozhu.com/search-duanzufang-p1-0/'
# Request headers sent with every fetch; User-Agent avoids the default python-requests UA.
header = {
'Content-type': 'text/html;charset=UTF-8',
# 'Referer': 'http://bj.58.com/pbdn/?PGTID=0d409654-01aa-6b90-f89c-4860fd7f9294&ClickID=1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
}
# Shared counter of detail URLs collected so far; wrapped in a list so helper
# functions can update it in place.
icount = [0]
# Collect the detail-page URLs from one listing page, stopping early once the
# shared counter reaches nbOfUrl.
def GetOnePageUrl(url, icount, nbOfUrl):
    """Fetch one listing page and return the detail-page URLs it links to.

    Parameters:
        url: listing-page URL to fetch.
        icount: single-element list used as a shared counter across calls;
            icount[0] is incremented once per collected URL.
        nbOfUrl: overall cap — stop collecting when icount[0] reaches it.

    Returns a list of href strings (possibly truncated when the cap is hit).
    """
    url_list = []
    web_data = requests.get(url, headers=header)
    # Only warn when the response is not 200 OK; the original printed this
    # "check your network" message unconditionally, which was misleading.
    if web_data.status_code != 200:
        print('请检查当前网络是否正常', web_data.status_code)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # Every <a> under the result list is one detail page.
    for link in soup.select('#page_list > ul > li > a '):
        url_list.append(link.get('href'))
        icount[0] += 1
        if icount[0] >= nbOfUrl:
            break
    print('读取URL条数 :', icount[0])
    return url_list
# Build the URL of the listing page after the given one.
def gotoNextPage(url):
    """Return the URL for the page following *url*.

    The original read url[-4], a single character, which breaks for page
    numbers >= 10 (e.g. '-p12-0/' would yield '2'). Parsing the number out
    of the '-p<N>-0/' segment handles any page number.

    Raises IndexError/ValueError if the URL lacks that pattern — callers
    always pass URLs built from the same format string, so this is a
    programming error rather than an expected condition.
    """
    page_segment = url.rsplit('-p', 1)[1]          # e.g. '12-0/'
    current_page = int(page_segment.split('-')[0])  # current page number
    return 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(current_page + 1)
# Crawl by number of detail pages, e.g. urls = GetPageUrl_ForPage(300).
# NOTE: in the pasted source the 'def' line had been swallowed into this
# comment; it is restored here as a real definition.
def GetPageUrl_ForPage(nb):
    """Collect detail-page URLs until at least *nb* have been gathered.

    Walks listing pages starting from the module-level ``url``, delegating
    each fetch to GetOnePageUrl, which also advances the shared counter
    ``icount``. Returns the accumulated list of detail-page URLs.
    """
    url_to_fetch = url
    url_list = []
    while icount[0] < nb:
        url_list.extend(GetOnePageUrl(url_to_fetch, icount, nb))
        # BUG FIX: advance from the page just fetched. The original called
        # gotoNextPage(url) with the global page-1 URL, so every iteration
        # re-fetched page 2 forever.
        url_to_fetch = gotoNextPage(url_to_fetch)
        if icount[0] > nb:
            break
        time.sleep(2)  # be polite to the server between page fetches
    return url_list
# Crawl a fixed number of listing pages, regardless of how many detail
# pages each one contains.
def GetNBPageDetail(nPage):
    """Return the detail-page URLs found on listing pages 1..nPage."""
    collected = []
    page_no = 1
    while page_no <= nPage:
        page_url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page_no)
        # Pass a huge cap so GetOnePageUrl never stops early on the shared
        # counter — here we crawl whole pages, not a URL quota.
        collected.extend(GetOnePageUrl(page_url, icount, 1000000))
        time.sleep(2)  # pause between pages
        page_no += 1
    return collected
# Decide the host's gender from the avatar's CSS class list:
# 'member_ico' marks male, 'member_icol' marks female.
def GetSuxual(strList):
    """Map a CSS class list to a gender string.

    Parameters:
        strList: the element's class list (e.g. ['member_ico']), or None
            when BeautifulSoup's .get('class') found no class attribute.

    Returns '男' for 'member_ico', '女' for 'member_icol', None otherwise.

    Improvements over the original: compares the class name directly
    instead of its length (10 vs 11 chars), and also catches the TypeError
    raised when strList is None — the original only caught IndexError.
    """
    try:
        css_class = strList[0]
    except (IndexError, TypeError):  # empty class list, or None from .get('class')
        print('检查一下,性别好像没抓到哦')
        return None
    if css_class == 'member_ico':
        return '男'
    if css_class == 'member_icol':
        return '女'
    print('检查一下,性别好像没抓对哦', strList)
    return None
#获取一个详情页上的所有信息,并返回一个字典()
def GetOneDetailInfor(url):
#需要获取的数据有: title ,district, price, hostPicSrc,hostSexual,
web_data = requests.get(url,headers=header)
soup = BeautifulSoup(web_data.text,'lxml')
titles = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
imgs = soup.select('#curBigImage ')
districts = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5') #它应该返回的是一个列表
prices = soup.select('#pricePart > div.day_l > span')
hostNames = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
hostPicSrcs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
hostSexuals = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div') #它根据字符数目来判断
for title,district,img,price,hostName,hostPicSrc,hostSexual in zip(titles,districts,imgs,prices,hostNames,hostPicSrcs,hostSexuals):