这次要爬取拉勾网。拉勾网的反爬做得还是很不错的:因为目标网站是Ajax交互的,我一开始是直接分析json接口来爬取的,但是真的很麻烦,请求头一旦出点问题就会被识别出来。后续我就改了一下方法,用selenium来模拟浏览器去获取。
思路嘛大概就是 获取主页的源代码——从中获取详情页的url——在去解析 先围绕这三步来写
这里我们已经获取到了主页的源代码
from selenium import webdriver
import requests
from selenium.webdriver import ChromeOptions #这个包用来规避被检测的风险
from lxml import etree
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import re
class lagouSpitder(object):
    """Selenium-based spider for lagou.com Python job listings.

    Drives a real Chrome browser (instead of calling the Ajax/JSON API
    directly) so the site's anti-scraping checks see ordinary browser
    traffic. Two evasion measures are applied: Chrome's automation
    switches are disabled, and a CDP-injected script hides
    ``navigator.webdriver``.

    NOTE(review): the class name looks like a typo for ``lagouSpider``;
    kept as-is because callers reference this exact name.
    """

    # Strip the "controlled by automated test software" markers that
    # Selenium-launched Chrome normally exposes.
    option = webdriver.ChromeOptions()
    option.add_experimental_option('useAutomationExtension', False)
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver_path = r'驱动路径'  # placeholder: path to the chromedriver executable

    def __init__(self):
        # Launch Chrome with the anti-detection options configured above.
        self.driver = webdriver.Chrome(executable_path=lagouSpitder.driver_path,
                                       options=lagouSpitder.option)
        # Run this script on every new document so navigator.webdriver
        # reads as undefined — defeats a common headless/bot detection check.
        self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
            """
        })
        # First listing page for the "python" keyword, all cities.
        self.url = 'https://www.lagou.com/jobs/list_python/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput='
        self.positions = []  # parsed job postings accumulate here
        self.source = None   # rendered HTML of the last fetched listing page

    def run(self):
        """Fetch the listing page and capture its rendered HTML.

        Navigates to ``self.url`` and stores the JS-rendered page source
        on ``self.source`` (the original discarded it in a dead local),
        also returning it so a later ``parse_list_page`` step can consume
        it directly. Only the first page is fetched for now.
        """
        self.driver.get(self.url)
        self.source = self.driver.page_source
        return self.source
if __name__ == '__main__':
    # Entry point: build the spider and kick off the crawl.
    lagouSpitder().run()
接下来获取详情页的url。定义一个函数parse_list_page,显得美观,可维护性也强。