# 爬虫之selenium (Web scraping with Selenium)

from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import random

class LagouSpider(object):
    """Scrape Python job listings for Shanghai from lagou.com with Selenium.

    Walks the paginated listing, opens each position's detail page in a new
    browser tab, extracts the fields, and accumulates them in ``self.positions``.
    """

    def __init__(self):
        self.driver = webdriver.Chrome()
        # Listing page: python jobs, default ordering, city = Shanghai (URL-encoded).
        self.url = "https://www.lagou.com/jobs/list_python?px=default&city=%E4%B8%8A%E6%B5%B7#filterBox"
        # Accumulated result dicts, one per position parsed.
        self.positions = []

    def parse_list_page(self, source):
        """Extract detail-page links from one listing page's HTML and visit each."""
        html = etree.HTML(source)
        links = html.xpath(".//a[@class='position_link']/@href")
        for link in links:
            # request_detail_page returns None; don't capture it.
            self.request_detail_page(link)
            # Small pause between detail requests to avoid hammering the site.
            time.sleep(1)

    def request_detail_page(self, url):
        """Open *url* in a new tab, parse it, then close the tab and switch back."""
        # Open the detail page in a new window/tab and switch to it.
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        # Randomized timeout makes the crawl pattern slightly less uniform.
        timeout = random.randint(10, 20)
        WebDriverWait(self.driver, timeout=timeout).until(
            EC.presence_of_element_located((By.XPATH, "//span[@class='name']"))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the detail tab and switch back to the listing page.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Parse one detail page's HTML and append the position to self.positions."""
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        # The job_request spans are, in order: salary, city, experience, education.
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath(".//text()")[0].strip()
        city = job_request_spans[1].xpath(".//text()")[0].strip()
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath(".//text()")[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        # BUG FIX: education is the 4th span (index 3); the original re-read
        # index 2 and stored a duplicate of work_years.
        education = job_request_spans[3].xpath(".//text()")[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = html.xpath("//em[@class='fl-cn']/text()")[0].strip()
        position = {
            "company_name": company_name,
            "name": position_name,
            "salary": salary,
            "city": city,
            "work_years": work_years,
            "education": education,
            "desc": desc,
        }
        self.positions.append(position)
        print(position)
        print("===" * 40)

    def run(self):
        """Drive the crawl: page through the listing until the last page."""
        self.driver.get(self.url)
        while True:
            # Wait until the pager is present so the listing has rendered,
            # THEN snapshot the page source (the original captured it before
            # waiting and could parse a not-yet-rendered page).
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            try:
                # Selenium 4 API: find_element_by_xpath was removed.
                next_btn = self.driver.find_element(By.XPATH, "//div[@class='pager_container']/span[last()]")
                print("------")
                print((next_btn.get_attribute("class")))
                if "pager_next pager_next_disabled" in next_btn.get_attribute("class"):
                    break  # Last page reached.
                else:
                    next_btn.click()
            except Exception:
                # Narrowed from a bare except (which also caught KeyboardInterrupt);
                # dump the page for diagnosis rather than swallowing silently.
                print(source)
            # Long randomized delay between listing pages (anti-bot measure).
            time.sleep(random.randint(10, 20))


if __name__ == '__main__':
    # Entry point: build the spider and start crawling immediately.
    LagouSpider().run()

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值