python爬虫：案例三：去哪儿酒店价格信息

最新推荐文章于 2024-03-19 20:50:39 发布

ZJL-阿友

最新推荐文章于 2024-03-19 20:50:39 发布

阅读量1.5w

点赞数

分类专栏：python爬虫　文章标签：python phantomjs selenium 爬虫 去哪儿

本文链接：https://blog.csdn.net/u013055678/article/details/51485876

版权

python爬虫专栏收录该内容

18 篇文章 0 订阅

订阅专栏

#去哪儿单页面数据爬取：

#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
import urllib
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import requests
import os
from selenium.webdriver.common.action_chains import ActionChains

# Address passed to driver.get(): a file:// URL for test.html resolved
# against the current working directory.
flie_path='file:///'+os.path.abspath('test.html')
#flie_path="http://hotel.qunar.com/city/hangzhou/dt-11571/?tag=hangzhou#fromDate=2016-05-24&toDate=2016-05-25&q=%E5%A6%82%E5%AE%B6%E5%BF%AB%E6%8D%B7%E6%9D%AD%E5%B7%9E%E8%A5%BF%E6%B9%96%E9%BB%84%E9%BE%99%E6%97%85%E6%B8%B8%E9%9B%86%E6%95%A3%E4%B8%AD%E5%BF%83%E5%BA%97&from=qunarHotel|sug&fromFocusList=0&filterid=eaa9982b-b203-4ed2-bf7e-e063936143c3_A&showMap=0&qptype=hotelName|poi&haspoi=1&QHFP=ZSS_A7D48C73"


class Xc():
    """Scrape per-room-type prices from a Qunar hotel page with PhantomJS."""

    def pc(self):
        """Load ``flie_path`` and collect the visible prices of every room type.

        Each price is printed as it is read.

        Returns:
            dict: maps the room-type heading (text of the ``h2`` inside the
            ``rtype`` element) to a list of price strings. The first
            character of each price text is dropped (presumably the
            currency sign -- confirm against the page markup).
        """
        driver = webdriver.PhantomJS()
        try:
            time.sleep(5)
            driver.get(flie_path)
            driver.implicitly_wait(30)
            # Fixed pause kept from the original script to let the page settle.
            time.sleep(5)
            dic = {}
            for a in driver.find_elements_by_class_name('hotel-quote-list'):
                lists = []
                for b in a.find_elements_by_class_name('js-dprice'):
                    price = b.text.strip()[1:]
                    print(price)
                    lists.append(price)
                title = a.find_element_by_class_name('rtype')
                dic[title.find_element_by_tag_name('h2').text] = lists
            return dic
        finally:
            # quit must be *called*; a bare ``driver.quit`` is a no-op and
            # leaves the PhantomJS process running.
            driver.quit()

# Run the scraper once and print the resulting {room type: [prices]} dict.
s=Xc()
print s.pc()

结果：

212237212218218237256236236237237265237244244265265244244265254284254269284{u'\u5927\u5e8a\u623f': [u'237', u'256', u'236', u'236', u'237'], u'\u7279\u60e0\u5546\u52a1\u623f(\u65e0\u7a97)': [u'265', u'265', u'244', u'244', u'265'], u'\u5355\u4eba\u623f': [u'212', u'237', u'212', u'218', u'218'], u'\u6807\u51c6\u53cc\u4eba\u623f': [u'254', u'284', u'254', u'269', u'284'], u'\u7279\u60e0\u53cc\u4eba\u95f4': [u'237', u'265', u'237', u'244', u'244']}

这个只是一个简单的模型：数字是价格，unicode 字符串是房型。上面的程序爬的是一个本地 html 文件——因为我这里的网速实在太慢，于是我把浏览器 F12 后的源码保存在一个 html 文件中再爬取数据；这个页面的 url 我也贴在上面的代码里（被注释掉的那行 flie_path）。

上面的代码有点问题，取到的数据不全，因为有些价格信息被隐藏了，看页面上会有“查看其他3条报价”之类的超链接

改一下代码：

#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
import urllib
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import requests
import os
from selenium.webdriver.common.action_chains import ActionChains

# Address passed to driver.get(): a file:// URL for test.html resolved
# against the current working directory.
flie_path='file:///'+os.path.abspath('test.html')
#flie_path="http://hotel.qunar.com/city/hangzhou/dt-11571/?tag=hangzhou#fromDate=2016-05-24&toDate=2016-05-25&q=%E5%A6%82%E5%AE%B6%E5%BF%AB%E6%8D%B7%E6%9D%AD%E5%B7%9E%E8%A5%BF%E6%B9%96%E9%BB%84%E9%BE%99%E6%97%85%E6%B8%B8%E9%9B%86%E6%95%A3%E4%B8%AD%E5%BF%83%E5%BA%97&from=qunarHotel|sug&fromFocusList=0&filterid=eaa9982b-b203-4ed2-bf7e-e063936143c3_A&showMap=0&qptype=hotelName|poi&haspoi=1&QHFP=ZSS_A7D48C73"

class Xc():
    """Scrape per-room-type prices from a Qunar hotel page with PhantomJS.

    Before reading prices, the "show more quotes" link of each room type is
    clicked so that collapsed quotes are included.
    """

    def pc(self):
        """Load ``flie_path``, expand every quote list and collect its prices.

        Each price is printed as it is read.

        Returns:
            dict: maps the room-type heading (text of the ``h2`` inside the
            ``rtype`` element) to a list of price strings. The first
            character of each price text is dropped (presumably the
            currency sign -- confirm against the page markup).
        """
        driver = webdriver.PhantomJS()
        try:
            time.sleep(5)
            driver.get(flie_path)
            # The implicit wait only needs to be configured once, not on
            # every loop iteration.
            driver.implicitly_wait(30)
            time.sleep(5)
            dic = {}
            for a in driver.find_elements_by_class_name('hotel-quote-list'):
                # Locate the "view the other N quotes" link. find_elements
                # returns an empty list instead of raising when a room type
                # has no collapsed quotes.
                more = a.find_elements_by_class_name("js-expand-more")
                if more:
                    # A real click (press + release); click_and_hold only
                    # presses the button, so no click event is dispatched.
                    ActionChains(driver).click(more[0]).perform()
                    time.sleep(5)
                lists = []
                for b in a.find_elements_by_class_name('js-dprice'):
                    price = b.text.strip()[1:]
                    print(price)
                    lists.append(price)
                title = a.find_element_by_class_name('rtype')
                dic[title.find_element_by_tag_name('h2').text] = lists
            return dic
        finally:
            # quit must be *called*; a bare ``driver.quit`` is a no-op and
            # leaves the PhantomJS process running.
            driver.quit()

# Run the scraper once and print the resulting {room type: [prices]} dict.
s=Xc()
print s.pc()

模拟鼠标点击，展开整个节点

因为我是本地文件所以模拟也没有效果，不过过程中没有报错，理论上应该没有错误，如果有错误请指正！

上面的代码还是不能抓到完全的价格数据，去哪儿会把最后几个房型的价格隐藏住，需要点开“展开报价”，然后再点开“查看其他3条报价”之类的超链接，才能看到全部价格！

webdriver 的 class 定位有个限制：find_element_by_class_name() 只接受单个 class 名，如果传入带空格的复合 class（例如 'room-item-inner room-item-wrapper'）就会报错、无法解析，所以我改用 xpath

#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
import urllib
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import requests
import os
from selenium.webdriver.common.action_chains import ActionChains

# Address passed to driver.get(): a file:// URL for test.html resolved
# against the current working directory.
flie_path='file:///'+os.path.abspath('test.html')
#flie_path="http://hotel.qunar.com/city/hangzhou/dt-11571/?tag=hangzhou#fromDate=2016-05-24&toDate=2016-05-25&q=%E5%A6%82%E5%AE%B6%E5%BF%AB%E6%8D%B7%E6%9D%AD%E5%B7%9E%E8%A5%BF%E6%B9%96%E9%BB%84%E9%BE%99%E6%97%85%E6%B8%B8%E9%9B%86%E6%95%A3%E4%B8%AD%E5%BF%83%E5%BA%97&from=qunarHotel|sug&fromFocusList=0&filterid=eaa9982b-b203-4ed2-bf7e-e063936143c3_A&showMap=0&qptype=hotelName|poi&haspoi=1&QHFP=ZSS_A7D48C73"


class Xc():
    """Scrape per-room-type prices from a Qunar hotel page with PhantomJS.

    For every room item, two links are clicked before reading prices: the
    one under ``p.btn-book-ct`` (presumably "expand quotes" -- confirm
    against the page) and the ``js-expand-more`` link ("view the other N
    quotes").
    """

    # XPaths of the links to click, in order. The leading "." makes each
    # one relative to the room element; a bare "//" would search the whole
    # document and always hit the first match on the page.
    _EXPAND_XPATHS = (
        ".//p[@class='btn-book-ct']/a",
        ".//a[@class='js-expand-more']",
    )

    def pc(self):
        """Load ``flie_path``, expand every room item and collect its prices.

        Each collected price is printed.

        Returns:
            dict: maps the room-type heading (text of the ``h2`` inside the
            ``rtype`` element) to a list of price strings. The first
            character of each price text is dropped, and entries that still
            start with the yen sign afterwards are skipped.
        """
        driver = webdriver.PhantomJS()
        try:
            time.sleep(5)
            driver.get(flie_path)
            # The implicit wait only needs to be configured once, not on
            # every loop iteration.
            driver.implicitly_wait(30)
            time.sleep(5)
            dic = {}
            rooms = driver.find_elements_by_xpath(
                "//div[@class='room-item-inner room-item-wrapper']")
            for a in rooms:
                for xpath in self._EXPAND_XPATHS:
                    # find_elements returns [] instead of raising when the
                    # room has nothing to expand.
                    links = a.find_elements_by_xpath(xpath)
                    if links:
                        # A real click (press + release); click_and_hold
                        # only presses the button, so no click event fires.
                        ActionChains(driver).click(links[0]).perform()
                        time.sleep(5)
                lists = []
                for b in a.find_elements_by_class_name('js-dprice'):
                    price = b.text.strip()[1:]
                    if not price.startswith('¥'):
                        lists.append(price)
                        print(price)
                title = a.find_element_by_class_name('rtype')
                dic[title.find_element_by_tag_name('h2').text] = lists
            return dic
        finally:
            # quit must be *called*; a bare ``driver.quit`` is a no-op and
            # leaves the PhantomJS process running.
            driver.quit()

# Run the scraper once and print the resulting {room type: [prices]} dict.
s=Xc()
print s.pc()

还是无法提供结果，因为我是本地文件，跑了一遍没有报错，理论上没有问题，如果你们运行报错请指正，谢谢！

ZJL-阿友

关注

0
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
python爬虫：案例三：去哪儿酒店价格信息

#coding=utf-8import sysreload(sys)sys.setdefaultencoding( "utf-8" )import urllibfrom selenium import webdriverimport timefrom bs4 import BeautifulSoupimport requestsimport osfrom selenium.we
复制链接

扫一扫