爬虫之selenium

最新推荐文章于 2021-05-30 17:19:35 发布

winnertakeall

最新推荐文章于 2021-05-30 17:19:35 发布

阅读量144

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/winnertakeall/article/details/88139890

版权

python 专栏收录该内容

15 篇文章 0 订阅

订阅专栏

from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import random

class LagouSpider(object):
    """Selenium-driven crawler that walks Lagou job-list pages.

    For every job link found on a list page the detail page is opened in a
    separate browser window, parsed with lxml, and the extracted fields are
    appended to ``self.positions`` as a dict.
    """

    def __init__(self):
        """Start a Chrome session and set the list-page URL to crawl."""
        self.driver = webdriver.Chrome()
        self.url = "https://www.lagou.com/jobs/list_python?px=default&city=%E4%B8%8A%E6%B5%B7#filterBox"
        # Accumulates one dict per successfully parsed detail page.
        self.positions = []

    def parse_list_page(self, source):
        """Visit every job detail link found in the list-page HTML.

        Args:
            source: HTML text of a job-list page.
        """
        html = etree.HTML(source)
        links = html.xpath(".//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            # Short pause between detail pages.
            time.sleep(1)

    def request_detail_page(self, url):
        """Open ``url`` in a new window, parse it, then return to the list.

        The detail window is always closed and focus is always returned to
        the window that was active on entry, even if waiting or parsing fails.

        Args:
            url: Address of the job detail page.

        Raises:
            RuntimeError: If the browser did not open a new window.
        """
        list_handle = self.driver.current_window_handle
        known_handles = set(self.driver.window_handles)
        # Pass the URL as a script argument instead of formatting it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0])", url)
        new_handles = [
            handle for handle in self.driver.window_handles
            if handle not in known_handles
        ]
        if not new_handles:
            raise RuntimeError("no new window was opened for %s" % url)
        self.driver.switch_to.window(new_handles[0])
        try:
            WebDriverWait(self.driver, timeout=random.randint(10, 20)).until(
                EC.presence_of_element_located((By.XPATH, "//span[@class='name']"))
            )
            self.parse_detail_page(self.driver.page_source)
        finally:
            # Close the detail window and go back to the list window.
            self.driver.close()
            self.driver.switch_to.window(list_handle)

    @staticmethod
    def _build_position(position_name, request_fields, desc, company_name):
        """Assemble the position dict from raw text fragments.

        Args:
            position_name: Job title text, stored unchanged.
            request_fields: Raw texts of the ``job_request`` spans, in page
                order; indexes 0-3 are read as salary, city, work years and
                education.
            desc: Job description text.
            company_name: Company name text.

        Returns:
            A dict with keys ``company_name``, ``name``, ``salary``, ``city``,
            ``work_years``, ``education`` and ``desc``.

        Raises:
            IndexError: If fewer than four request fields are supplied.
        """
        def strip_separators(text):
            # Remove all whitespace and the "/" separators around the value.
            return re.sub(r"[\s/]", "", text.strip())

        return {
            "company_name": company_name.strip(),
            "name": position_name,
            "salary": request_fields[0].strip(),
            "city": strip_separators(request_fields[1]),
            "work_years": strip_separators(request_fields[2]),
            # Education is the fourth span; index 2 is the work-years span.
            "education": strip_separators(request_fields[3]),
            "desc": desc.strip(),
        }

    def parse_detail_page(self, source):
        """Extract one position from detail-page HTML and store it.

        The resulting dict is appended to ``self.positions`` and printed.

        Args:
            source: HTML text of a job detail page.

        Raises:
            IndexError: If an expected element is missing from the page.
        """
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        # Only the first text node of each of the first four spans is used.
        request_fields = [
            span.xpath(".//text()")[0] for span in job_request_spans[:4]
        ]
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()"))
        company_name = html.xpath("//em[@class='fl-cn']/text()")[0]
        position = self._build_position(
            position_name, request_fields, desc, company_name
        )
        self.positions.append(position)
        print(position)
        print("===" * 40)

    def run(self):
        """Crawl list pages until the "next" button is disabled.

        The browser session is quit when the crawl ends, whether it finished
        normally or was interrupted by an exception.
        """
        pager_xpath = "//div[@class='pager_container']/span[last()]"
        try:
            self.driver.get(self.url)
            while True:
                # Wait for the pager before reading the page source, so the
                # HTML that gets parsed is the rendered list page.
                WebDriverWait(driver=self.driver, timeout=10).until(
                    EC.presence_of_element_located((By.XPATH, pager_xpath))
                )
                source = self.driver.page_source
                self.parse_list_page(source)
                try:
                    next_btn = self.driver.find_element(By.XPATH, pager_xpath)
                    print("------")
                    btn_class = next_btn.get_attribute("class")
                    print(btn_class)
                    if "pager_next pager_next_disabled" in btn_class:
                        break
                    next_btn.click()
                except Exception as exc:
                    # Best effort: report the failure and dump the page for
                    # debugging, then retry after the pause below.
                    # KeyboardInterrupt is not caught, so Ctrl-C still stops
                    # the crawl.
                    print(exc)
                    print(source)
                time.sleep(random.randint(10, 20))
        finally:
            self.driver.quit()


# Script entry point: create the spider and start crawling, but only when
# this file is executed directly rather than imported.
if __name__ == '__main__':
    LagouSpider().run()

winnertakeall

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
爬虫之selenium

from selenium import webdriver; from lxml import etree; import re; import time; from selenium.webdriver.support.ui import WebDriverWait; from selenium.webdriver.support import expected_conditions as EC; fr...
复制链接

扫一扫