Selenium 爬取拉勾网（反爬问题未解决，求大佬指点）

最新推荐文章于 2021-04-13 22:11:15 发布

大数据舔狗

最新推荐文章于 2021-04-13 22:11:15 发布

阅读量198

点赞数

本文链接：https://blog.csdn.net/weixin_45853749/article/details/107761421

版权

import logging
import re
import time

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# HTTP headers intended for requests to lagou.com. The User-Agent and Cookie
# values are placeholders ("replace with your own") that must be filled in.
# NOTE(review): nothing in the code visible here reads `headers`; the Selenium
# driver does not receive them -- confirm whether they are still needed.
headers = {
    "User-Agent": "替换成自己的即可",
    "Cookie": "替换成自己的即可",
    "Referer": "https://www.lagou.com/",
    "Origin": "https://www.lagou.com",
}

class LagouSpider(object):
    """Selenium-driven crawler for the Lagou job-listing pages.

    ``run`` opens the listing URL, visits every job linked from each listing
    page in a temporary browser tab, prints one dict per job, and clicks the
    pager's "next" button until that button is marked as disabled.
    """

    # Location of the ChromeDriver binary; adjust for the local machine.
    driver_path = r"D:\Chrome Driver\chromedriver.exe"

    # The last <span> of the pager is the "next page" button.
    NEXT_BUTTON_XPATH = '//div[@class="pager_container"]/span[last()]'

    WAIT_TIMEOUT = 10   # seconds to wait for an element to appear
    PAGE_DELAY = 2      # pause after clicking "next"
    DETAIL_DELAY = 1    # pause between two detail pages
    RETRY_DELAY = 20    # back-off after a browser error
    MAX_RETRIES = 5     # consecutive listing-page failures before giving up

    _log = logging.getLogger(__name__)

    def __init__(self):
        """Start a Chrome session and remember the listing URL to crawl."""
        # NOTE(review): `executable_path` is the Selenium 3 keyword; recent
        # Selenium 4 releases expect a `Service` object instead -- confirm
        # which Selenium version this script targets.
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        self.url = "https://www.lagou.com/jobs/list_Python/p-city_3-gm_6?px=default#filterBox"

    def run(self):
        """Crawl every listing page, then shut the browser down.

        A browser error triggers a pause of ``RETRY_DELAY`` seconds and a
        retry; after ``MAX_RETRIES`` consecutive errors the crawl stops
        instead of looping forever. The driver is always quit on exit.
        """
        failures = 0
        page_parsed = False  # avoids re-scraping a page when only the click failed
        try:
            self.driver.get(self.url)
            while failures < self.MAX_RETRIES:
                try:
                    WebDriverWait(driver=self.driver, timeout=self.WAIT_TIMEOUT).until(
                        EC.presence_of_element_located((By.XPATH, self.NEXT_BUTTON_XPATH)))
                    if not page_parsed:
                        # Read the source only after the pager is present so
                        # that a fully rendered page is parsed.
                        self.parse_list_page(self.driver.page_source)
                        page_parsed = True
                    btn = self.driver.find_element(By.XPATH, self.NEXT_BUTTON_XPATH)
                    if "pager_next_disabled" in (btn.get_attribute("class") or ""):
                        break
                    btn.click()
                    page_parsed = False
                    failures = 0
                    time.sleep(self.PAGE_DELAY)
                except WebDriverException as exc:
                    failures += 1
                    self._log.warning("listing page failed (%d/%d): %s",
                                      failures, self.MAX_RETRIES, exc)
                    time.sleep(self.RETRY_DELAY)
            else:
                self._log.error("giving up after %d consecutive failures",
                                self.MAX_RETRIES)
        finally:
            self.driver.quit()

    def parse_list_page(self, source):
        """Visit every job detail page linked from one listing page.

        Args:
            source: HTML text of the listing page.
        """
        html = self._parse_html(source)
        if html is None:
            return
        for link in html.xpath("//a[@class='position_link']/@href"):
            self.request_detail_page(link)
            time.sleep(self.DETAIL_DELAY)

    def request_detail_page(self, url):
        """Open ``url`` in a new tab, scrape it, and return to the listing tab.

        Browser errors while loading the page are logged and followed by a
        ``RETRY_DELAY`` pause. Any tab opened here is closed even on failure.

        Args:
            url: address of the job detail page.
        """
        list_handle = self.driver.current_window_handle
        try:
            # Pass the URL as a script argument instead of splicing it into
            # the JavaScript source, so quotes in the URL cannot break it.
            self.driver.execute_script("window.open(arguments[0])", url)
            self.driver.switch_to.window(self.driver.window_handles[-1])
            WebDriverWait(self.driver, timeout=self.WAIT_TIMEOUT).until(
                EC.presence_of_element_located((By.XPATH, "//h1[@class='name']")))
            self.parse_source_page(self.driver.page_source)
        except WebDriverException as exc:
            self._log.warning("detail page %s failed: %s", url, exc)
            time.sleep(self.RETRY_DELAY)
        finally:
            self._close_other_tabs(list_handle)

    def parse_source_page(self, source):
        """Extract the job fields from a detail page and print them.

        Args:
            source: HTML text of the job detail page.

        Returns:
            The dict that was printed, or ``None`` when the page lacks the
            expected title or the four leading ``job_request`` spans.
        """
        html = self._parse_html(source)
        if html is None:
            return None
        names = html.xpath("//h1[@class='name']/text()")
        spans = html.xpath("//dd[@class='job_request']//span")
        if not names or len(spans) < 4:
            self._log.warning("detail page does not have the expected layout")
            return None
        # The first four spans are read as salary, city, experience and
        # education; their text is wrapped in "/" separators and spaces.
        salary, city, experience, education = (
            self._clean_field(span.xpath("./text()")) for span in spans[:4]
        )
        position = {
            '职业名称': names[0].strip(),
            '工资': salary,
            '城市': city,
            '工作经验': experience,
            '学历': education,
        }
        print(position)
        return position

    def _close_other_tabs(self, keep_handle):
        """Close every tab except ``keep_handle`` and switch back to it."""
        for handle in self.driver.window_handles:
            if handle != keep_handle:
                self.driver.switch_to.window(handle)
                self.driver.close()
        self.driver.switch_to.window(keep_handle)

    @staticmethod
    def _parse_html(source):
        """Return the parsed HTML tree, or ``None`` if ``source`` is empty or unparsable."""
        if not source:
            return None
        try:
            return etree.HTML(source)
        except (etree.LxmlError, ValueError):
            return None

    @staticmethod
    def _clean_field(texts):
        """Join text nodes and strip surrounding whitespace and "/" separators."""
        return "".join(texts).strip().strip("/").strip()

# Script entry point: build the spider and crawl when run directly.
if __name__ == "__main__":
    spider = LagouSpider()
    spider.run()