1. Overview
Today we'll walk through using Scrapy to crawl train ticket information from Qunar, at https://train.qunar.com/. The end result is the ticket data printed to the terminal, as shown in the final step.
2. Tools Used
- phantomjs
- selenium
- scrapy
- BeautifulSoup
3. Environment Setup (scrapy and BeautifulSoup installation is not covered here)
PhantomJS setup
Download phantomjs-2.1.1-windows from http://phantomjs.org/ and unzip it to the D: drive.
Selenium setup
pip install selenium
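To make sure the two pieces work together, a quick check like the following can be run first (a minimal sketch; the executable_path assumes PhantomJS was unzipped to D:\ as described above):

from selenium import webdriver

# the path assumes PhantomJS was unzipped to D:\ as described above
driver = webdriver.PhantomJS(executable_path='D:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
driver.get('https://train.qunar.com/')
print(driver.title)  # prints the page title if PhantomJS and selenium are wired up correctly
driver.quit()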
4. Writing the Code
1. When crawling the static HTML in the usual way, the content inside the ticket-list ul tag could not be retrieved, so the ticket data must be injected into the ul dynamically by JavaScript. We therefore have to wait for the JS to finish loading and only then grab the HTML, otherwise there is no ticket information. This is where Scrapy's downloader middleware comes in (it can process and wrap a request before it is handed to the downloader). You can confirm the problem in the Scrapy shell, as shown below.
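A quick way to confirm this (a sketch: the URL is the station-to-station query that getIndexPage builds further down, and div.js_listinfo is the ticket-row container the spider parses; if the rows are JS-rendered, the selector comes back empty against the raw HTML):

>>scrapy shell "https://train.qunar.com/stationToStation.htm?fromStation=%E6%B7%B1%E5%9C%B3&toStation=%E9%95%BF%E6%B2%99&date=2017-09-06&stsSearch="
>>> response.css('div.js_listinfo')
[]   # empty -- the ticket rows are rendered by JavaScript, not present in the raw HTML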
2. Writing the custom downloader middleware
# -*- coding: utf-8 -*-
from selenium import webdriver
from scrapy.http import HtmlResponse
import time


class JavaScriptMiddleware(object):
    # process the request before it is handed to the downloader
    def process_request(self, request, spider):
        if spider.name == "ticketSpider":  # my spider's name -- change it to yours
            # the browser to use: the PhantomJS installed above
            driver = webdriver.PhantomJS(executable_path='D:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
            driver.get(request.url)
            time.sleep(1)
            js = "var q=document.documentElement.scrollTop=10000"
            # execute JS to simulate user actions; here we scroll the page to the very bottom
            driver.execute_script(js)
            time.sleep(3)
            body = driver.page_source
            return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
        else:
            return
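The fixed time.sleep calls work, but they waste time on fast loads and can still come back too early on slow ones. A variant of the middleware that waits explicitly for the ticket list to appear and then shuts PhantomJS down could look like the sketch below (js_listinfo is the row class the spider parses later; the 10-second timeout is an arbitrary choice):

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http import HtmlResponse


class JavaScriptMiddleware(object):
    def process_request(self, request, spider):
        if spider.name != "ticketSpider":
            return
        driver = webdriver.PhantomJS(executable_path='D:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
        driver.get(request.url)
        # wait up to 10 seconds for at least one ticket row (class js_listinfo) to be rendered
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'js_listinfo')))
        body = driver.page_source
        url = driver.current_url
        driver.quit()  # shut PhantomJS down instead of leaking one process per request
        return HtmlResponse(url, body=body, encoding='utf-8', request=request)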
The path of this JavaScriptMiddleware .py file is <scrapy project>/middlewares/middleware.py; in my case it is ticket/middlewares/middleware.py. Then configure settings.py:
# pretend to be a browser
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

# declare the downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'ticket.middlewares.middleware.JavaScriptMiddleware': 543,  # package path of JavaScriptMiddleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,  # disable the built-in middleware
}

# do not obey robots.txt
ROBOTSTXT_OBEY = False
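For orientation, the files mentioned so far sit roughly like this in a standard Scrapy project layout (only the middleware path is stated explicitly above; the rest is the usual scrapy startproject structure):

ticket/                          # project root
├── scrapy.cfg
└── ticket/                      # the Scrapy package
    ├── settings.py              # the settings shown above
    ├── middlewares/
    │   └── middleware.py        # JavaScriptMiddleware from step 2
    └── spiders/                 # the spider from the next step lives here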
3. Writing the spider
The parsing is done with BeautifulSoup.
# -*- coding: utf-8 -*-
import scrapy
import urllib.parse
import logging
from bs4 import BeautifulSoup
import re

################### query conditions
START_PALCE = '深圳'  # departure station
END_PALCE = '长沙'  # arrival station
DATE = '2017-09-06'  # travel date


class TicketspiderSpider(scrapy.Spider):
    name = 'ticketSpider'
    allowed_domains = ['train.qunar.com']
    start_urls = ['http://train.qunar.com/']

    def start_requests(self):
        REQ = []
        REQ.append(scrapy.Request(self.getIndexPage(START_PALCE, END_PALCE, DATE)))
        return REQ

    def parse(self, response):
        soup = BeautifulSoup(str(response.body, encoding="utf-8"), 'html.parser')
        divs = soup.find_all('div', {'class': 'js_listinfo'})
        for item in divs:
            divList = item.find_all('div')
            # train number
            trainNumber = self.del_content_blank(str(divList[0].find('h3').text))
            # departure station
            startPlace = self.del_content_blank(str(divList[1].find('p', {'class': 'start'}).find('span').text))
            # arrival station
            endPlace = self.del_content_blank(str(divList[1].find('p', {'class': 'end'}).find('span').text))
            # departure time
            startTime = self.del_content_blank(str(divList[2].find_all('time')[0].text))
            # arrival time
            endTime = self.del_content_blank(str(divList[2].find_all('time')[1].text))
            # travel duration
            duration = self.del_content_blank(str(divList[3].find('time').text))
            # ticket info: a list whose elements are dicts like
            # {
            #   dic['seatType']: '一等座'  -- seat type
            #   dic['seatPrice']: '200'    -- seat price
            #   dic['tickets']: 100        -- remaining tickets, 0 when sold out
            # }
            ticketInfo = []
            for i in divList[4].find_all('p', {'class': 'ticketed'}):
                dic = {}
                # seat type: second class, first class, standing, ...
                dic['seatType'] = i.text[0:i.text.index('\xa0')]
                # seat price
                dic['seatPrice'] = re.findall(r"\d+\.?\d*", i.find('span').text)[0]
                ticketInfo.append(dic)
            # remaining tickets
            index = 0
            for i in divList[5].find_all('p', {'class': 'surplus'}):
                dic = ticketInfo[index]
                if '张' in str(i.text):  # tickets are available
                    # number of remaining tickets
                    dic['tickets'] = int(re.findall(r"\d+\.?\d*", str(i.text))[0])
                else:  # sold out, remaining count is 0
                    dic['tickets'] = 0
                index = index + 1
            # --------------------- format the data and print it
            STR = trainNumber + ',' + startPlace + ',' + endPlace + ',' + startTime + ',' + endTime + ',' + duration
            STR = STR + '['
            for item in ticketInfo:
                for key in item:
                    if key == 'seatPrice':
                        STR = STR + '(票价:' + str(item[key]) + '),'
                    elif key == 'tickets':
                        STR = STR + '(余票:' + str(item[key]) + '),'
                    else:
                        STR = STR + '(' + str(item[key]) + '),'
            STR = STR[0:-1] + ']'
            print(STR)
            print('==================================')

    def del_content_blank(self, s):  # strip spaces, newlines and other useless characters
        clean_str = re.sub(r'\n| |\xa0|\\xa0|\u3000|\\u3000|\\u0020|\u0020', '', str(s))
        return clean_str

    def getIndexPage(self, startPlace, endPlace, time):
        startPlace = urllib.parse.quote(startPlace)  # URL-encode the Chinese station name for the GET request
        endPlace = urllib.parse.quote(endPlace)
        return 'https://train.qunar.com/stationToStation.htm?fromStation=' + startPlace + '&toStation=' + endPlace + '&date=' + time + '&stsSearch='
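The parse method above only prints each row. If you'd rather run the results through Scrapy's item pipeline or feed exports, one option is to define an Item and yield it instead of building STR (a sketch; the TicketItem class and its field names are my own and not part of the original project):

import scrapy

class TicketItem(scrapy.Item):
    # illustrative field names matching the variables collected in parse()
    trainNumber = scrapy.Field()
    startPlace = scrapy.Field()
    endPlace = scrapy.Field()
    startTime = scrapy.Field()
    endTime = scrapy.Field()
    duration = scrapy.Field()
    ticketInfo = scrapy.Field()

# then, at the end of the loop in parse(), instead of print(STR):
#     yield TicketItem(trainNumber=trainNumber, startPlace=startPlace, endPlace=endPlace,
#                      startTime=startTime, endTime=endTime, duration=duration,
#                      ticketInfo=ticketInfo)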
4. Run the spider; the ticket information will be printed to the terminal:
>>scrapy crawl ticketSpider
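If the spider yields items as sketched above, Scrapy's feed export can also write the results to a file instead of just printing them:
>>scrapy crawl ticketSpider -o tickets.json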
As always: if you're in Shenzhen, love music, and play drums (or guitar, keyboard, bass, etc.), whether you're a programmer or in any other line of work, you're welcome to join our band and jam together. My QQ: 657455400