from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from pandas import Series,DataFrame
# from selenium.webdriver.common.keys import Keys
# ---- Stage 1: collect the street-level search-page URLs for one district ----
# Loads the ziroom search page for district id `num`, waits for the area
# filter list to render, then writes a (区, 街道, 网址) table to Excel.
num = 23008613
url = 'http://www.ziroom.com/z/nl/z2-d%d.html' % num

fireFoxOptions = webdriver.FirefoxOptions()
# set_headless() is deprecated in Selenium 3.8+; the CLI flag is equivalent.
fireFoxOptions.add_argument('-headless')
browser = webdriver.Firefox(firefox_options=fireFoxOptions)
try:
    browser.get(url)
    # Wait until the district filter list (2nd <dl>) is present before parsing.
    locator = (By.XPATH, '/html/body/div[4]/div/div/div/dl[2]/dd/ul')
    WebDriverWait(browser, 10, 0.5).until(EC.presence_of_element_located(locator))
    soup = BeautifulSoup(browser.page_source, 'lxml')
finally:
    # BUG FIX: the original never closed the browser, leaking a Firefox process.
    browser.quit()

frame1 = DataFrame(columns=['区', '街道', '网址'])
index = 0
areaList = soup.find('ul', class_='clearfix filterList')
# contents[3:-1:2] steps over the whitespace text nodes between <li> tags and
# presumably skips a leading "all" entry — confirm against the live page markup.
for area in areaList.contents[3:-1:2]:
    areaName = list(area.stripped_strings)[0]
    for littleArea in area.div.contents[3:-1:2]:
        frame1.loc[index, '区'] = areaName
        frame1.loc[index, '街道'] = littleArea.a.string
        # hrefs are protocol-relative ("//www..."), so prefix the scheme.
        frame1.loc[index, '网址'] = 'http:' + littleArea.a.get('href')
        index += 1
frame1.to_excel('自如网址1.xlsx')
#先爬的具体到街道的搜索列表页网址,因为搜索页上限展示50页,以区的级别爬的话朝阳可能会少爬一些数据,所以我是以街道的分类爬的,慢点,但我不急,定个时,一天爬一次
import pandas as pd
import re
import pytesseract
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from pandas import DataFrame
from io import BytesIO
from PIL import Image
# ---- Stage 2: visit every street-level search URL from stage 1 and scrape ----
# ---- the listing rows into one DataFrame, then save to Excel.            ----
frame = pd.read_excel('自如网址.xlsx', index_col=0)

options = webdriver.ChromeOptions()
# set_headless() is deprecated in Selenium 3.8+; the CLI flag is equivalent.
options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=options)

frame_1 = DataFrame(columns=['区', '街道', '小区', '房间数', '房间朝向', '房间网址',
                             '面积', '楼层', '最近地铁距离', '价格'])
# Crop box (left, top, right, bottom) for the price background image — kept for
# the removed OCR-based price extraction described in the notes below.
box = (180, 316, 820, 383)


def _parse_houses(soup, district, street, out, index):
    """Append one search-result page's listings to *out* starting at row *index*.

    Returns the next free row index. <li> elements carrying extra CSS classes
    (ads/placeholders) are skipped.
    """
    for house in soup.find_all('li', class_='clearfix'):
        if len(house['class']) > 1:
            continue
        title = house.find('a', class_='t1').string
        # Title shape (inferred from the slicing): "…·<小区><N>居室-<朝向>".
        part = title.split('·')[1]
        out.loc[index, '区'] = district
        out.loc[index, '街道'] = street
        out.loc[index, '小区'] = part.split('-')[0][:-3]
        out.loc[index, '房间数'] = int(part.split('-')[0][-3])
        out.loc[index, '房间朝向'] = part.split('-')[1]
        out.loc[index, '房间网址'] = house.find('a', class_='t1')['href']
        detail = list(house.find('div', class_='detail').stripped_strings)
        out.loc[index, '面积'] = detail[0]
        out.loc[index, '楼层'] = detail[2]
        out.loc[index, '最近地铁距离'] = detail[-1]
        # BUG FIX: the original called getprice(house, numList), but both names
        # are undefined (the anti-scraping price code was deliberately removed),
        # raising NameError on the first row. Record None until price extraction
        # is reinstated.
        out.loc[index, '价格'] = None
        index += 1
    return index


index = 0
try:
    # BUG FIX: the original kept a manual index_1 counter that was NOT
    # incremented on the "no results" continue path, so every row after such a
    # page got the wrong 区/街道. enumerate() keeps url and row index in sync.
    for index_1, url in enumerate(frame.iloc[:, 2]):
        driver.get(url)
        WebDriverWait(driver, 10, 0.5).until(
            EC.presence_of_element_located((By.ID, 'page')))
        WebDriverWait(driver, 10, 0.5).until(
            EC.presence_of_element_located((By.ID, 'houseList')))
        soup = BeautifulSoup(driver.page_source, 'lxml')
        if len(soup.find_all('div', class_='nomsg area')) == 1:
            continue  # empty "no results" page for this street
        district = frame.loc[index_1, '区']
        street = frame.loc[index_1, '街道']
        index = _parse_houses(soup, district, street, frame_1, index)
        try:
            # 4th-from-last <span> of the pager holds the total page count,
            # wrapped in one character on each side (hence the [1:-1] slice).
            pageNum = int(soup.find('div', class_='pages')
                          .find_all('span')[-4].string[1:-1])
        except IndexError:
            continue  # only a single page of results — no pager spans
        for i in range(2, pageNum + 1):
            driver.get(url + '?p=%d' % i)
            WebDriverWait(driver, 10, 0.5).until(
                EC.presence_of_element_located((By.ID, 'houseList')))
            soup = BeautifulSoup(driver.page_source, 'lxml')
            index = _parse_houses(soup, district, street, frame_1, index)
finally:
    # BUG FIX: the original never closed the browser, leaking a Chrome process.
    driver.quit()
frame_1.to_excel('自如爬虫.xlsx')
最近事多,所以隔了这么久才把这个爬虫写出来。还有些细节要完善,比如顺便把房间标签也爬了,还有价格是按天还是按月的单位。自如的搜索页做了防爬处理,不想让别人抓取,所以价格的这部分代码我删了。
但我还是讲讲原理吧。
价格是通过加载背景图片并设定偏移变量的方式加载出来的,每个页面一个对应的背景图片。
方法一,单品信息页的价格没做这个处理,所以可以根据搜索列表页的网址跳转过去抓取。不过这种方法一个单品加载一次新网页,应该比较慢,但也可以顺便把住户信息也爬了。
方法二,截取图片进行识别。可以直接截取页面图片,锁定价格位置,识别价格。或者截取页面价格对应的背景图片和偏移量,算出价格。
这里贴几个有帮助的网址
http://imweb.io/topic/595b7161d6ca6b4f0ac71f05
https://blog.csdn.net/djd1234567/article/details/50739872
我就是照着这些网址学的识别图片,识别图片时可以做一下二值化处理,再裁切一下,提高精度
接下来估计要学多线程加快速度,这样爬虫了解的差不多了,就可以学现成的框架了,应该会学scrapy。
ps: 我准备用爬到的数据做个回归,算一下自如的价格模型。