python爬虫之爬取拉勾职位列表以及职位详情(selenium+chrome)

最新推荐文章于 2024-04-04 01:24:48 发布

一个喜欢林俊杰的靓仔

最新推荐文章于 2024-04-04 01:24:48 发布

阅读量250

点赞数

分类专栏： python爬虫 | 文章标签： 列表 selenium xpath python chrome

本文链接：https://blog.csdn.net/qq_37680159/article/details/105348468

版权

python爬虫专栏收录该内容

10 篇文章 0 订阅

订阅专栏

from lxml import etree
from selenium import webdriver
import time
import re
from selenium.webdriver.support.ui import Select,WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class LagouSpider(object):
    """Crawl Lagou job listings with a Selenium-driven Chrome browser.

    The spider walks the paginated search-result list, opens every job's
    detail page in a second browser window, extracts a few fields with lxml
    XPath queries and accumulates them in ``self.info``.

    Note on signatures: the helper methods are declared as
    ``(delf, self, ...)`` and are called as ``self.method(self, ...)``.
    ``delf`` receives the bound instance and ``self`` receives the instance
    passed explicitly by the caller. The signatures are kept unchanged so
    that existing call sites keep working.
    """

    def __init__(self):
        """Start a Chrome session and initialise the start URL and results."""
        self.driver = webdriver.Chrome()
        self.url = 'https://www.lagou.com/jobs/list_python/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput='
        # One dict per scraped job with the keys 'name', 'salary' and 'city'.
        self.info = []

    def run(self):
        """Crawl every result page, then shut the browser down.

        Each iteration parses the current list page, waits (up to 10 s) for
        the "next page" button to be present and clicks it. The loop ends
        when the button carries the ``pager_next_disabled`` class, i.e. the
        last page has been processed. The browser is always quit on exit,
        including when an exception propagates.
        """
        next_btn_xpath = "//div[@class='pager_container']/span[last()]"
        try:
            self.driver.get(self.url)
            while True:
                source = self.driver.page_source
                self.parse_list_page(self, source)
                # Explicitly wait until the pager button exists in the DOM.
                WebDriverWait(driver=self.driver, timeout=10).until(
                    EC.presence_of_all_elements_located((By.XPATH, next_btn_xpath))
                )
                # find_element(By.XPATH, ...) works on Selenium 3 and 4; the
                # find_element_by_* helpers no longer exist in Selenium 4.
                next_btn = self.driver.find_element(By.XPATH, next_btn_xpath)
                # A greyed-out button means there is no further page. Stop
                # here instead of re-scraping the last page forever.
                if "pager_next_disabled" in (next_btn.get_attribute("class") or ""):
                    break
                # Click through JavaScript so an overlay cannot intercept it.
                self.driver.execute_script("arguments[0].click();", next_btn)
                time.sleep(1)
        finally:
            # Release the Chrome/chromedriver processes.
            self.driver.quit()

    def parse_list_page(delf, self, source):
        """Visit the detail page of every job linked from one list page.

        Args:
            self: The spider instance, passed explicitly by the caller.
            source: HTML source of a search-result list page.
        """
        html = etree.HTML(source)
        # Detail-page URLs of all positions listed on the current page.
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(self, link)
            # Pause between detail pages to throttle the request rate.
            time.sleep(1)

    def request_detail_page(delf, self, url):
        """Open ``url`` in a second window, parse it and return to the list.

        Args:
            self: The spider instance, passed explicitly by the caller.
            url: Address of a job detail page.

        Raises:
            selenium.common.exceptions.TimeoutException: If the job title
                element does not appear within 10 seconds.
        """
        print(url)
        # Remember the list window so we can come back to it afterwards.
        list_window = self.driver.current_window_handle
        # Pass the URL as a script argument instead of splicing it into the
        # JavaScript text, so quotes inside the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0])", url)
        # Work in the new window so the list page is not replaced.
        self.driver.switch_to.window(self.driver.window_handles[1])
        try:
            # window.open() does not block until the page has loaded; wait
            # for the job title before reading the page source.
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//span[@class='name']"))
            )
            source = self.driver.page_source
            self.parse_detail_page(self, source)
        finally:
            # Always close the detail window and refocus the list window,
            # even when waiting or parsing failed.
            self.driver.close()
            self.driver.switch_to.window(list_window)

    def parse_detail_page(delf, self, source):
        """Extract name, salary and city from a detail page into ``self.info``.

        Only a small subset of the available fields is collected.

        Args:
            self: The spider instance, passed explicitly by the caller.
            source: HTML source of a job detail page.

        Raises:
            IndexError: If the expected title or job-request spans are
                missing from ``source``.
        """
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath(".//text()")[0].strip()
        # Drop the whitespace and "/" separators surrounding the city name.
        city = re.sub(r"[\s/]", "", city)
        detail_info = {
            'name': position_name,
            'salary': salary,
            'city': city
        }
        self.info.append(detail_info)
        print(self.info)
        print("=" * 40)  # visual separator: forty "=" characters

if __name__ == '__main__':
    # Script entry point: build the spider and start crawling immediately.
    LagouSpider().run()

一个喜欢林俊杰的靓仔

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python爬虫之爬取拉勾职位列表以及职位详情(selenium+chrome)

from lxml import etree; from selenium import webdriver; import time; import re; from selenium.webdriver.support.ui import Select,WebDriverWait; from selenium.webdriver.support import expected_conditions as...
复制链接

扫一扫

专栏目录