由于智联的页面是由js动态加载的,一般的方法只能得到js加载前的页面,为了得到加载过的页面需要通过模拟浏览器来拿到完整的页面.
下面的代码只是简单的实现,爬取智联页面的部分功能,其他根据需要自己实现
中间件(middlewares.py)代码:
from scrapy.http import HtmlResponse
import time
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
class SeleniumMiddleware(object):
    """Downloader middleware that renders listing pages in headless Firefox.

    The listing pages are assembled by JavaScript, so the HTML is taken from
    a real browser instead of Scrapy's downloader.  ``request.meta["page"]``
    selects what happens:

    * ``0`` -- open ``request.url`` and dismiss the risk-warning dialog when
      it is shown.
    * ``1`` -- not handled here; ``None`` is returned so Scrapy continues
      with the remaining middlewares / the regular downloader.
    * ``2`` -- scroll the page that is already open to the bottom and click
      the second button of the ``.soupager`` pager.
    * any other value -- the page currently open is returned as-is.

    Requests that carry no ``page`` entry at all (robots.txt, requests of
    other spiders, ...) are passed through untouched.

    The geckodriver location defaults to ``DEFAULT_DRIVER_PATH`` and can be
    overridden with the optional ``GECKODRIVER_PATH`` Scrapy setting.
    """

    DEFAULT_DRIVER_PATH = r"D:\tools\geckodriver.exe"

    def __init__(self, executable_path=DEFAULT_DRIVER_PATH):
        """Start one headless Firefox instance shared by all requests.

        Args:
            executable_path: Path of the geckodriver binary.
        """
        self.options = Options()
        self.options.add_argument("--headless")
        self.driver = webdriver.Firefox(executable_path=executable_path, options=self.options)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and make sure the browser is closed again.

        Without the ``spider_closed`` hook the Firefox process would keep
        running after the crawl has finished.
        """
        # Imported locally so the block does not depend on new file-level imports.
        from scrapy import signals

        middleware = cls(crawler.settings.get("GECKODRIVER_PATH", cls.DEFAULT_DRIVER_PATH))
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        """Shut the browser down when the spider finishes."""
        self.driver.quit()

    def process_request(self, request, spider):
        """Serve listing requests from the browser.

        Args:
            request: The Scrapy request; ``request.meta["page"]`` is read.
            spider: The spider that issued the request (unused).

        Returns:
            ``None`` when the request is not handled here, otherwise an
            ``HtmlResponse`` built from the browser's current URL and DOM.
        """
        page = request.meta.get("page")
        if page is None:
            # Not one of our requests: let the normal download chain run.
            return None
        page = int(page)
        if page == 1:
            return None

        if page == 2:
            self._click_next_page()
        elif page == 0:
            self._open_first_page(request.url)

        # Give the JavaScript time to finish rendering before the snapshot.
        time.sleep(5)
        return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source,
                            encoding="utf8", request=request)

    def _click_next_page(self):
        """Scroll to the bottom of the open page and click the pager's second button."""
        try:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(3)
            pager = self.driver.find_element_by_css_selector(".soupager")
            buttons = pager.find_elements_by_tag_name("button")
            print("#" * 50)
            print(buttons)
            buttons[1].click()
            time.sleep(3)
        except Exception as e:
            # Deliberately best-effort: on failure the current page is
            # still returned to the spider, so only report the error.
            print(e)

    def _open_first_page(self, url):
        """Load *url* and dismiss the risk-warning dialog if one is present."""
        try:
            print("url is ::::::", url)
            self.driver.get(url)
            time.sleep(2)
            # find_elements_* returns an empty list instead of raising, so a
            # page without the dialog no longer aborts the request.
            warning_buttons = self.driver.find_elements_by_css_selector(
                '.risk-warning__content > button:nth-child(1)')
            if warning_buttons:
                warning_buttons[0].click()
        except TimeoutException:
            print("time out")
# 该处代码是详情页也需要模拟时使用
class SeleniumMiddlewareDetail(object):
    """Downloader middleware that renders detail pages in headless Firefox.

    Only requests whose ``request.meta["page"]`` equals ``1`` are handled;
    every other request (including those without a ``page`` entry) gets
    ``None`` so Scrapy continues with the remaining middlewares / the
    regular downloader.

    The geckodriver location defaults to ``DEFAULT_DRIVER_PATH`` and can be
    overridden with the optional ``GECKODRIVER_PATH`` Scrapy setting.
    """

    DEFAULT_DRIVER_PATH = r"D:\tools\geckodriver.exe"

    def __init__(self, executable_path=DEFAULT_DRIVER_PATH):
        """Start the headless Firefox instance used for detail pages.

        Args:
            executable_path: Path of the geckodriver binary.
        """
        self.options = Options()
        self.options.add_argument("--headless")
        self.driver = webdriver.Firefox(executable_path=executable_path, options=self.options)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and register browser shutdown on spider close."""
        # Imported locally so the block does not depend on new file-level imports.
        from scrapy import signals

        middleware = cls(crawler.settings.get("GECKODRIVER_PATH", cls.DEFAULT_DRIVER_PATH))
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        """Quit the browser so no Firefox process outlives the crawl."""
        self.driver.quit()

    def process_request(self, request, spider):
        """Load a detail page in the browser and return the rendered HTML.

        Args:
            request: The Scrapy request; ``request.meta["page"]`` is read.
            spider: The spider that issued the request (unused).

        Returns:
            ``None`` for requests that are not detail pages, otherwise an
            ``HtmlResponse`` built from the browser's current URL and DOM.
        """
        page = request.meta.get("page")
        if page is None or int(page) != 1:
            return None

        try:
            print("url is ::::::", request.url)
            self.driver.get(request.url)
            time.sleep(2)
        except TimeoutException:
            print("time out")

        # Give the JavaScript time to finish rendering before the snapshot.
        time.sleep(5)
        return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source,
                            encoding="utf8", request=request)
# 在模拟浏览器过程中如果还想要在downloader实现下载 只要中间件不return就可以
# 如果页面一直处于加载状态(加载的小圆圈一直在转),只要停止加载,页面内容就会显示出来
# 可以使用这个方法: browser.execute_script('window.stop()')
爬虫文件(spider.py)代码:
# -*- coding: utf-8 -*-
import time
import scrapy
import lxml.html
from scrapy import Request
class JobDes(object):
    """Simple record holding a job's detail-page URL and its title.

    Attributes:
        detail_url: URL of the job's detail page; ``""`` until set.
        title: Title of the job; ``""`` until set.
    """

    def __init__(self, detail_url="", title=""):
        """Create a record; both fields default to the empty string.

        Args:
            detail_url: URL of the job's detail page.
            title: Title of the job.
        """
        self.detail_url = detail_url
        self.title = title

    def __repr__(self):
        return "JobDes(detail_url={!r}, title={!r})".format(self.detail_url, self.title)
def parse_lxml_zhilian(html_str):
    """Extract the job links from the HTML of a rendered listing page.

    Args:
        html_str: HTML text of the listing page.

    Returns:
        A list of ``JobDes`` objects, one for every job anchor that has an
        ``href``, in document order.  Empty or blank input yields ``[]``
        (lxml refuses to parse an empty document).
    """
    if not html_str or not html_str.strip():
        return []

    tree = lxml.html.fromstring(html_str)
    jobs = []
    # Read href and title from the same anchor so the two can never get
    # out of step when an anchor lacks one of the attributes.
    for anchor in tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]'):
        href = anchor.get("href")
        if not href:
            continue
        job = JobDes()
        job.detail_url = href
        job.title = anchor.get("title") or ""
        jobs.append(job)
    return jobs


# Number of detail-page requests that were scheduled and have not finished
# yet; used to judge whether paging runs far ahead of downloading.
count = 0

# Backlog size above which the next listing page is de-prioritised.
MAX_PENDING_DETAILS = 300


class ZhaopinSpider(scrapy.Spider):
    """Crawl job listings and schedule one request per job found.

    Requests are tagged through ``meta["page"]``: ``"0"`` for the first
    listing page, ``"2"`` for "turn to the next listing page" and ``"1"``
    for job detail pages.
    """

    name = 'zhaopin'
    # allowed_domains = ['ts.zhaopin.com']
    # start_urls = ['http://ts.zhaopin.com/']

    def start_requests(self):
        """Start with the first listing page of the search."""
        url_str = 'https://sou.zhaopin.com/?jl=489&kw=python&kt=3'
        yield Request(url=url_str, callback=self.parse, meta={"page": "0"})

    def parse(self, response):
        """Schedule every job on a listing page, then the next listing page.

        Args:
            response: Rendered listing page.

        Yields:
            One detail ``Request`` per job link, followed by a paging
            ``Request`` when the pager's second button is present.
        """
        global count

        # Page markup is not stable; be careful with positional selectors
        # such as nth-child(1) and verify them against the rendered page.
        jobs = parse_lxml_zhilian(response.text)
        page_next = response.xpath('//*[@id="pagination_content"]/div/button[2]')
        self.logger.debug("found %d jobs on %s", len(jobs), response.url)
        self.logger.debug("page_next is %s", page_next)

        for job in jobs:
            count += 1
            yield Request(url=response.urljoin(job.detail_url), callback=self.parse_detal,
                          errback=self._detail_failed, meta={"page": "1"}, dont_filter=True)

        if page_next:
            # Never sleep or busy-wait here: callbacks run on the reactor
            # thread, so waiting for ``count`` to drop would stall the very
            # downloads that lower it.  With a large backlog the paging
            # request is given a lower priority instead, so queued detail
            # requests are served first.
            priority = -1 if count > MAX_PENDING_DETAILS else 0
            yield Request(url=response.url, callback=self.parse, meta={"page": "2"},
                          dont_filter=True, priority=priority)

    def parse_detal(self, response=None):
        """Callback for a downloaded detail page.

        Extraction is left to be implemented; for now it only records that
        one pending detail download has finished.

        Args:
            response: The detail page response passed in by Scrapy.
        """
        self._detail_finished()

    def _detail_failed(self, failure):
        """Errback: a failed detail download also leaves the backlog."""
        self.logger.warning("detail page failed: %s", failure.value)
        self._detail_finished()

    @staticmethod
    def _detail_finished():
        """Decrease the pending-detail counter without going below zero."""
        global count
        count = max(count - 1, 0)