Python Ziroom (自如) Crawler

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from pandas import DataFrame


num = 23008613
url = 'http://www.ziroom.com/z/nl/z2-d%d.html' % num

# Run Firefox headlessly; set_headless() is deprecated in newer Selenium,
# so pass the -headless argument instead.
fireFoxOptions = webdriver.FirefoxOptions()
fireFoxOptions.add_argument('-headless')
browser = webdriver.Firefox(options=fireFoxOptions)
browser.get(url)

# Wait up to 10s (polling every 0.5s) for the district filter list to render.
locator = (By.XPATH, '/html/body/div[4]/div/div/div/dl[2]/dd/ul')
WebDriverWait(browser, 10, 0.5).until(EC.presence_of_element_located(locator))
soup = BeautifulSoup(browser.page_source, 'lxml')
areaList = soup.find('ul', class_='clearfix filterList')

index = 0
frame1 = DataFrame(columns=['区', '街道', '网址'])

# .contents alternates whitespace strings and tags, so step by 2;
# starting at index 3 skips the first entry (the catch-all option).
for area in areaList.contents[3:-1:2]:
    areaName = list(area.stripped_strings)[0]   # district name
    littleAreaList = area.div.contents[3:-1:2]  # street-level entries
    for littleArea in littleAreaList:
        frame1.loc[index, '区'] = areaName
        frame1.loc[index, '街道'] = littleArea.a.string
        frame1.loc[index, '网址'] = 'http:' + littleArea.a.get('href')  # hrefs are protocol-relative
        index += 1

frame1.to_excel('自如网址.xlsx')  # the second script reads this file back in
# I scrape the street-level search-list URLs first: search results show at most
# 50 pages, so crawling at the district level could silently drop listings in a
# big district like Chaoyang. Street by street is slower, but I'm not in a
# hurry; schedule it to run once a day.
import pandas as pd
import re
import pytesseract
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from pandas import DataFrame
from io import BytesIO
from PIL import Image

frame = pd.read_excel('自如网址.xlsx', index_col=0)

# Headless Chrome; set_headless() and chrome_options= are deprecated in newer Selenium.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

index = 0    # row counter for the listings frame
index_1 = 0  # row pointer into the street-URL frame
frame_1 = DataFrame(columns=['区','街道','小区','房间数','房间朝向','房间网址',\
                             '面积','楼层','最近地铁距离','价格'])
box = (180, 316, 820, 383)  # screenshot crop box for the price area (used by the omitted price OCR)


for url in frame.iloc[:, 2]:
    driver.get(url)
    # Wait for both the pager and the house list to render before parsing.
    locator_f = (By.ID, 'page')
    WebDriverWait(driver, 10, 0.5).until(EC.presence_of_element_located(locator_f))
    locator = (By.ID, 'houseList')
    WebDriverWait(driver, 10, 0.5).until(EC.presence_of_element_located(locator))
    soup = BeautifulSoup(driver.page_source, 'lxml')
    # Streets with no listings render a "no results" block; skip them,
    # but keep the street pointer in sync.
    if len(soup.find_all('div', class_='nomsg area')) == 1:
        index_1 += 1
        continue
    houseList = soup.find_all('li', class_='clearfix')

    for house in houseList:
        # Promo/ad <li> elements carry extra classes; real listings have just one.
        if len(house['class']) > 1:
            continue
        # The title's second segment looks like "<小区名>N居室-<朝向>".
        title = house.find('a', class_='t1').string
        littleArea = title.split('·')[1].split('-')[0][:-3]      # community: drop trailing "N居室"
        amountRoom = int(title.split('·')[1].split('-')[0][-3])  # N, the room count
        towards = title.split('·')[1].split('-')[1]              # orientation
        houseUrl = house.find('a', class_='t1')['href']
        detail = list(house.find('div', class_='detail').stripped_strings)
        frame_1.loc[index, '区'] = frame.loc[index_1, '区']
        frame_1.loc[index, '街道'] = frame.loc[index_1, '街道']
        frame_1.loc[index, '小区'] = littleArea
        frame_1.loc[index, '房间数'] = amountRoom
        frame_1.loc[index, '房间朝向'] = towards
        frame_1.loc[index, '房间网址'] = houseUrl
        frame_1.loc[index, '面积'] = detail[0]
        frame_1.loc[index, '楼层'] = detail[2]
        frame_1.loc[index, '最近地铁距离'] = detail[-1]
        # getprice() and numList are deliberately left out (anti-scraping);
        # the notes after the code explain how the price can be recovered.
        frame_1.loc[index, '价格'] = getprice(house, numList)
        index += 1
    # The pager's fourth-from-last <span> holds the page count (text like "共8页");
    # an IndexError here means the results fit on a single page.
    try:
        pageNum = int(soup.find('div', class_='pages').find_all('span')[-4].string[1:-1])
    except IndexError:
        index_1 += 1
        continue
            
    # Pages 2..pageNum repeat the first-page parsing, with ?p=N appended to the URL.
    for i in range(2, pageNum + 1):
        newUrl = url + '?p=%d' % i
        driver.get(newUrl)
        locator = (By.ID, 'houseList')
        WebDriverWait(driver, 10, 0.5).until(EC.presence_of_element_located(locator))
        soup = BeautifulSoup(driver.page_source, 'lxml')
        houseList = soup.find_all('li', class_='clearfix')

        for house in houseList:
            if len(house['class']) > 1:
                continue
            title = house.find('a', class_='t1').string
            littleArea = title.split('·')[1].split('-')[0][:-3]
            amountRoom = int(title.split('·')[1].split('-')[0][-3])
            towards = title.split('·')[1].split('-')[1]
            houseUrl = house.find('a', class_='t1')['href']
            detail = list(house.find('div', class_='detail').stripped_strings)
            frame_1.loc[index, '区'] = frame.loc[index_1, '区']
            frame_1.loc[index, '街道'] = frame.loc[index_1, '街道']
            frame_1.loc[index, '小区'] = littleArea
            frame_1.loc[index, '房间数'] = amountRoom
            frame_1.loc[index, '房间朝向'] = towards
            frame_1.loc[index, '房间网址'] = houseUrl
            frame_1.loc[index, '面积'] = detail[0]
            frame_1.loc[index, '楼层'] = detail[2]
            frame_1.loc[index, '最近地铁距离'] = detail[-1]
            frame_1.loc[index, '价格'] = getprice(house, numList)
            index += 1
    index_1 += 1
           
frame_1.to_excel('自如爬虫.xlsx')



Things have been busy lately, which is why it took me this long to get this crawler written. A few details still need finishing, such as also scraping the room tags, and recording whether the price is per day or per month. Ziroom's search pages apply anti-scraping measures to the price (they don't want it harvested), so I deleted that part of my code.

But let me walk through how it works anyway.

The price is rendered by loading a background image and setting a per-digit offset into it, and each page gets its own corresponding background image.
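For reference, here is a minimal, hypothetical sketch of what the omitted getprice/numList pair could look like. Everything specific in it is an assumption rather than Ziroom's confirmed markup: the 'price' class, the style format, and the 21.4px sprite cell width. numList would hold the digit sequence read off the page's background image (by OCR, for example).

import re

def getprice(house, numList, cell_width=21.4):
    """Hypothetical: decode a price from digit-sprite offsets.

    numList is the digit order OCR'd from this page's background image,
    e.g. [6, 1, 0, 8, 3, 9, 2, 7, 5, 4]. cell_width is an assumption.
    """
    price_tag = house.find('p', class_='price')  # selector is a guess
    if price_tag is None:
        return None
    digits = []
    for span in price_tag.find_all('span'):
        m = re.search(r'background-position:\s*-([\d.]+)px', span.get('style', ''))
        if m:
            # Each offset is a whole number of sprite cells from the left edge.
            digits.append(str(numList[int(round(float(m.group(1)) / cell_width))]))
    return int(''.join(digits)) if digits else None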

Method 1: the single-listing detail page doesn't apply this trick to the price, so you can jump there from the search-results URL and scrape it directly. Since this loads a fresh page per listing it should be fairly slow, but it also lets you grab the tenant info while you're at it.
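A rough sketch of that route, assuming the listing hrefs are protocol-relative like the street links were, and using a placeholder class name ('room_price') since I haven't confirmed the detail page's actual selector:

driver.get('http:' + houseUrl)  # href appears protocol-relative, as in the street list
detail_soup = BeautifulSoup(driver.page_source, 'lxml')
price_tag = detail_soup.find('span', class_='room_price')  # placeholder selector
price = price_tag.get_text(strip=True) if price_tag else None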

Method 2: capture an image and run recognition on it. Either screenshot the page directly, locate the price region, and recognize the digits; or grab the price's background image and offsets and compute the price from those.
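The screenshot route is what the otherwise-unused imports (pytesseract, BytesIO, PIL.Image) and the box tuple in the script above were for. A minimal sketch; the crop coordinates depend on window size and page layout:

png = driver.get_screenshot_as_png()      # PNG bytes of the current page
img = Image.open(BytesIO(png)).crop(box)  # box = (left, upper, right, lower)
price_text = pytesseract.image_to_string(
    img, config='--psm 7 -c tessedit_char_whitelist=0123456789')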

Here are a few links that helped:

http://imweb.io/topic/595b7161d6ca6b4f0ac71f05

https://blog.csdn.net/djd1234567/article/details/50739872

These are the pages I followed to learn the image recognition. When recognizing, binarize the image first and crop it down a bit; both improve accuracy.
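Continuing the screenshot sketch above, a simple fixed-threshold binarization with PIL (the threshold of 128 is just a starting point to tune):

gray = img.convert('L')                           # grayscale first
bw = gray.point(lambda p: 255 if p > 128 else 0)  # hard threshold to black/white
price_text = pytesseract.image_to_string(
    bw, config='--psm 7 -c tessedit_char_whitelist=0123456789')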


Next I'll probably learn multithreading to speed this up. Once I have a decent grasp of crawlers, I can move on to an off-the-shelf framework, most likely Scrapy.

PS: I plan to run a regression on the scraped data and work out Ziroom's pricing model.

